[PATCH 2/2] R600/SI: Implement VGPR register spilling v2
Tom Stellard
thomas.stellard at amd.com
Tue Apr 29 23:31:26 PDT 2014
VGPRs are spilled to LDS.
v2: Only calculate thread id once
---
lib/Target/R600/AMDGPUAsmPrinter.cpp | 3 +-
lib/Target/R600/SIInstrInfo.cpp | 163 +++++++++++++++++++++++++++++-
lib/Target/R600/SIInstrInfo.h | 8 ++
lib/Target/R600/SIInstructions.td | 23 +++++
lib/Target/R600/SIMachineFunctionInfo.cpp | 51 ++++++++--
lib/Target/R600/SIMachineFunctionInfo.h | 10 ++
6 files changed, 247 insertions(+), 11 deletions(-)
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 170f479..4ac702e 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -311,7 +311,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
LDSAlignShift = 9;
}
unsigned LDSBlocks =
- RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ RoundUpToAlignment(MFI->LDSSize + (MFI->LDSWaveSpillSize * 64),
+ 1 << LDSAlignShift) >> LDSAlignShift;
if (MFI->ShaderType == ShaderType::COMPUTE) {
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 8cf3f38..038e16b 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -199,7 +199,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, KillFlag)
.addImm(Lane);
MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
- } else {
+ } else if (RI.isSGPRClass(RC)){
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for vector
// registers.
@@ -224,6 +224,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
.addReg(SrcReg)
.addImm(FrameIndex);
+ } else if(RI.hasVGPRs(RC)) {
+ unsigned Opcode;
+ switch(RC->getSize() * 8) {
+ case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
+ case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
+ case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
+ default: llvm_unreachable("Cannot spill register class");
+ }
+ MFI->allocateLDSSpaceForSpill(FrameIndex, RC->getSize());
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MI, DL, get(Opcode), TmpReg)
+ .addReg(MFI->getSpillTIDVirtualReg(MRI, MBB.getParent()))
+ .addReg(SrcReg)
+ .addImm(FrameIndex);
+ } else {
+ llvm_unreachable("Don't know how to spill register class");
}
}
@@ -233,6 +252,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
SIMachineFunctionInfo::SpilledReg Spill =
@@ -259,7 +279,69 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addReg(Spill.VGPR)
.addImm(FrameIndex);
insertNOPs(MI, 3);
+ } else if(RI.hasVGPRs(RC)) {
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned Opcode;
+ switch(RC->getSize() * 8) {
+ case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
+ case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
+ case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
+ default: llvm_unreachable("Cannot spill register class");
+ }
+ BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ .addReg(TmpReg, RegState::Define)
+ .addReg(MFI->getSpillTIDVirtualReg(MRI, MBB.getParent()))
+ .addImm(FrameIndex);
+ }
+}
+
+/// Emit the instructions that compute the LDS address used to spill or
+/// restore one dword for the current thread.
+///
+/// The address written to \p TmpReg is
+///   MFI->LDSSize + (FrameOffset * 64) + TIDOffsetReg
+/// where \p TIDOffsetReg holds the V_MBCNT_LO/HI result shifted left by 2.
+/// The instructions that compute \p TIDOffsetReg are emitted only once per
+/// function (guarded by MFI->HasCalculatedTIDOffset), at the top of the entry
+/// block.
+///
+/// \param TmpReg Register that receives the computed address.
+/// \param TIDOffsetReg Register holding the per-thread byte offset.
+/// \param FrameOffset Offset in bytes of the dword being spilled.
+/// \param Size Size in bytes of the spilled register (currently unused).
+/// \returns \p TmpReg.
+unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                               unsigned TmpReg,
+                                               unsigned TIDOffsetReg,
+                                               unsigned FrameOffset,
+                                               unsigned Size) const {
+  SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  unsigned ThreadsInWave = 64;
+  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * ThreadsInWave);
+
+  if (!MFI->HasCalculatedTIDOffset) {
+    MachineBasicBlock &Entry = MBB.getParent()->front();
+    MachineBasicBlock::iterator Insert = Entry.front();
+    // Use a distinct name so the outer DL (used for the add below) is not
+    // shadowed.
+    DebugLoc EntryDL = Insert->getDebugLoc();
+    // Get the thread id within the wave (NOTE(review): relies on the
+    // V_MBCNT_LO/HI pair with an all-ones mask -- confirm against the ISA).
+    BuildMI(Entry, Insert, EntryDL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+            TIDOffsetReg)
+            .addImm(-1)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0);
+
+    BuildMI(Entry, Insert, EntryDL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+            TIDOffsetReg)
+            .addImm(-1)
+            .addReg(TIDOffsetReg);
+
+    // Scale by 4 so each thread addresses its own dword.
+    BuildMI(Entry, Insert, EntryDL, get(AMDGPU::V_LSHLREV_B32_e32),
+            TIDOffsetReg)
+            .addImm(2)
+            .addReg(TIDOffsetReg);
+    MFI->HasCalculatedTIDOffset = true;
   }
+
+  // Add the frame's LDS offset to the per-thread offset
+  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
+          .addImm(LDSOffset)
+          .addReg(TIDOffsetReg);
+
+  return TmpReg;
 }
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
@@ -267,16 +349,30 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
switch (Op) {
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V512_RESTORE:
return 16;
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
return 8;
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
return 4;
+ case AMDGPU::SI_SPILL_V96_SAVE:
+ case AMDGPU::SI_SPILL_V96_RESTORE:
+ return 3;
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
return 2;
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ return 1;
default: llvm_unreachable("Invalid spill opcode");
}
}
@@ -347,7 +443,72 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
MI->eraseFromParent();
break;
}
+
+  // VGPR register spill to LDS.  Each 32-bit sub-register is stored with its
+  // own DS_WRITE_B32 at frame offset LDSSpillOffsets[FrameIndex] + 4 * i.
+  case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V96_SAVE:
+  case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V32_SAVE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned TmpReg = MI->getOperand(0).getReg();
+    unsigned TIDOffsetReg = MI->getOperand(1).getReg();
+    unsigned SrcReg = MI->getOperand(2).getReg();
+    unsigned FrameIndex = MI->getOperand(3).getImm();
+    unsigned BaseOffset = MFI->LDSSpillOffsets[FrameIndex];
+    unsigned Size = NumSubRegs * 4;
+
+    for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+      unsigned SubReg = NumSubRegs > 1 ?
+          RI.getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
+          SrcReg;
+      // Derive the offset from the base on every iteration.  Accumulating it
+      // (Offset += i * 4) yields 0, 4, 12, 24, ... and runs past the bytes
+      // reserved for this frame index.
+      unsigned Offset = BaseOffset + (i * 4);
+      unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, TmpReg, TIDOffsetReg,
+                                                  Offset, Size);
+
+      // Store the value in LDS
+      BuildMI(MBB, MI, DL, get(AMDGPU::DS_WRITE_B32))
+              .addImm(0) // gds
+              .addReg(AddrReg) // addr
+              .addReg(SubReg) // data0
+              .addImm(0); // offset
+    }
+
+    MI->eraseFromParent();
+    break;
+  }
+  // VGPR register restore from LDS.  Mirrors the _SAVE expansion: each 32-bit
+  // sub-register is loaded with its own DS_READ_B32 from frame offset
+  // LDSSpillOffsets[FrameIndex] + 4 * i.
+  case AMDGPU::SI_SPILL_V32_RESTORE:
+  case AMDGPU::SI_SPILL_V64_RESTORE:
+  case AMDGPU::SI_SPILL_V96_RESTORE:
+  case AMDGPU::SI_SPILL_V128_RESTORE:
+  case AMDGPU::SI_SPILL_V256_RESTORE:
+  case AMDGPU::SI_SPILL_V512_RESTORE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned TempReg = MI->getOperand(1).getReg();
+    unsigned TIDOffsetReg = MI->getOperand(2).getReg();
+    unsigned FrameIndex = MI->getOperand(3).getImm();
+    unsigned BaseOffset = MFI->LDSSpillOffsets[FrameIndex];
+    unsigned Size = NumSubRegs * 4;
+
+    // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
+    for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+      unsigned SubReg = NumSubRegs > 1 ?
+          RI.getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
+          DstReg;
+
+      // Derive the offset from the base on every iteration.  Accumulating it
+      // (Offset += i * 4) yields 0, 4, 12, 24, ... and reads outside the bytes
+      // reserved for this frame index.
+      unsigned Offset = BaseOffset + (i * 4);
+      unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, TempReg, TIDOffsetReg,
+                                                  Offset, Size);
+      BuildMI(MBB, MI, DL, get(AMDGPU::DS_READ_B32), SubReg)
+              .addImm(0) // gds
+              .addReg(AddrReg) // addr
+              .addImm(0); //offset
+    }
+    MI->eraseFromParent();
+    break;
   }
+ }
+
return true;
}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 59ebc35..a2fb473 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -48,6 +48,14 @@ private:
MachineInstr *Inst, unsigned Opcode) const;
void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
+ unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned TmpReg,
+ unsigned TIDOffsetReg,
+ unsigned Offset,
+ unsigned Size) const;
+
+
public:
explicit SIInstrInfo(AMDGPUTargetMachine &tm);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index b93de36..1cd60fc 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1595,6 +1595,29 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { // Defines the _SAVE/_RESTORE spill pseudo instructions for one VGPR register class.
+  def _SAVE : InstSI <
+    (outs VGPR_32:$temp), // scratch register; the expansion uses it to hold the LDS address
+    (ins VGPR_32:$tid, vgpr_class:$src, i32imm:$frame_idx), // per-thread offset register, value to spill, frame index
+    "", []
+  > {
+    let Constraints = "@earlyclobber $temp"; // $temp is written before the inputs are fully read, so it must not share a register with them
+  }
+
+  def _RESTORE : InstSI <
+    (outs vgpr_class:$dst, VGPR_32:$temp), // restored value and the scratch address register
+    (ins VGPR_32:$tid, i32imm:$frame_idx), // per-thread offset register and frame index
+    "", []
+  >;
+}
+
+defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>;
+defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+
} // end IsCodeGenOnly, isPseudo
def : Pat<
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index af60995..e4a39f6 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -12,6 +12,7 @@
#include "SIMachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@@ -26,8 +27,22 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
+ SpillTIDVirtualReg(AMDGPU::NoRegister),
PSInputAddr(0),
- SpillTracker() { }
+ SpillTracker(),
+ LDSWaveSpillSize(0),
+ HasCalculatedTIDOffset(false) { }
+
+/// Add \p Reg as an implicit use operand of the S_ENDPGM instruction that
+/// terminates the first block of \p MF ending in one, so that the register's
+/// live range extends to that instruction.  Emits an error through the
+/// LLVMContext if no block ends with S_ENDPGM.
+static void addFunctionLiveOut(unsigned Reg, MachineFunction *MF) {
+  for (MachineBasicBlock &MBB : *MF) {
+    // back() must not be called on an empty block, so skip those.
+    if (MBB.empty() || MBB.back().getOpcode() != AMDGPU::S_ENDPGM)
+      continue;
+    MBB.back().addOperand(*MF, MachineOperand::CreateReg(Reg, /*isDef=*/false,
+                                                         /*isImp=*/true));
+    return;
+  }
+  MF->getFunction()->getContext().emitError(
+      "Could not find S_ENDPGM instruction.");
+}
static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
@@ -55,15 +70,8 @@ static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
//
// To work around this, we add Lane VGPRs to the functions live out list,
// so that we can guarantee its live range will cover all of its uses.
+ addFunctionLiveOut(VGPR, MF);
- for (MachineBasicBlock &MBB : *MF) {
- if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
- MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true));
- return VGPR;
- }
- }
- MF->getFunction()->getContext().emitError(
- "Could not found S_ENGPGM instrtuction.");
return VGPR;
}
@@ -92,3 +100,28 @@ const SIMachineFunctionInfo::SpilledReg&
SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) {
return SpilledRegisters[FrameIndex];
}
+
+/// Return the virtual register used as the $tid operand of the VGPR spill
+/// pseudos, creating it on first use.  On creation the register is added as
+/// an implicit def on the first instruction of the entry block and passed to
+/// addFunctionLiveOut().
+unsigned SIMachineFunctionInfo::getSpillTIDVirtualReg(
+    MachineRegisterInfo &MRI,
+    MachineFunction *MF) {
+  // Already created by an earlier spill or restore: reuse it.
+  if (SpillTIDVirtualReg != AMDGPU::NoRegister)
+    return SpillTIDVirtualReg;
+
+  SpillTIDVirtualReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+
+  MachineInstr &FirstInst = MF->front().front();
+  FirstInst.addOperand(MachineOperand::CreateReg(SpillTIDVirtualReg,
+                                                 true, true));
+  addFunctionLiveOut(SpillTIDVirtualReg, MF);
+
+  return SpillTIDVirtualReg;
+}
+
+/// Reserve \p NumBytes of per-wave spill space for \p FrameIndex the first
+/// time it is seen and return the byte offset assigned to it.  Later calls
+/// for the same frame index return the recorded offset without growing
+/// LDSWaveSpillSize.
+unsigned SIMachineFunctionInfo::allocateLDSSpaceForSpill(unsigned FrameIndex,
+                                                         unsigned NumBytes) {
+  typedef std::map<unsigned, unsigned> OffsetMap;
+
+  OffsetMap::iterator Slot = LDSSpillOffsets.find(FrameIndex);
+  if (Slot == LDSSpillOffsets.end()) {
+    // New frame index: it starts at the current end of the spill area.
+    Slot = LDSSpillOffsets.insert(
+        OffsetMap::value_type(FrameIndex, LDSWaveSpillSize)).first;
+    LDSWaveSpillSize += NumBytes;
+  }
+
+  return Slot->second;
+}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 8eb35c9..5eb05d1 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -26,6 +26,9 @@ class MachineRegisterInfo;
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
virtual void anchor();
+
+ unsigned SpillTIDVirtualReg;
+
public:
struct SpilledReg {
@@ -59,6 +62,13 @@ public:
SIMachineFunctionInfo(const MachineFunction &MF);
unsigned PSInputAddr;
struct RegSpillTracker SpillTracker;
+ unsigned LDSWaveSpillSize;
+ bool HasCalculatedTIDOffset;
+ /// Key is FrameIndex, value is byte offset
+ std::map<unsigned, unsigned> LDSSpillOffsets;
+ unsigned getSpillTIDVirtualReg(MachineRegisterInfo &MRI, MachineFunction *MF);
+ /// Returns the wave local offset for this \p FrameIndex
+ unsigned allocateLDSSpaceForSpill(unsigned FrameIndex, unsigned NumBytes);
};
} // End namespace llvm
--
1.8.1.5
More information about the llvm-commits
mailing list