[llvm] r287942 - AMDGPU/SI: Add back reverted SGPR spilling code, but disable it
Marek Olsak via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 25 09:37:09 PST 2016
Author: mareko
Date: Fri Nov 25 11:37:09 2016
New Revision: 287942
URL: http://llvm.org/viewvc/llvm-project?rev=287942&view=rev
Log:
AMDGPU/SI: Add back reverted SGPR spilling code, but disable it
This was suggested by Matt as a better solution.
Added:
llvm/trunk/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir
llvm/trunk/test/CodeGen/AMDGPU/inline-constraints.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
llvm/trunk/test/CodeGen/AMDGPU/read_register.ll
llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
llvm/trunk/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Fri Nov 25 11:37:09 2016
@@ -253,7 +253,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {
case 1:
- return AMDGPU::SReg_32RegClassID;
+ return AMDGPU::SReg_32_XM0RegClassID;
case 2:
return AMDGPU::SReg_64RegClassID;
case 4:
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Nov 25 11:37:09 2016
@@ -59,7 +59,7 @@ SITargetLowering::SITargetLowering(const
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
@@ -79,8 +79,8 @@ SITargetLowering::SITargetLowering(const
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
if (Subtarget->has16BitInsts()) {
- addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
}
computeRegisterProperties(STI.getRegisterInfo());
@@ -941,25 +941,25 @@ SDValue SITargetLowering::LowerFormalArg
// Start adding system SGPRs.
if (Info->hasWorkGroupIDX()) {
unsigned Reg = Info->addWorkGroupIDX();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupIDY()) {
unsigned Reg = Info->addWorkGroupIDY();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupIDZ()) {
unsigned Reg = Info->addWorkGroupIDZ();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupInfo()) {
unsigned Reg = Info->addWorkGroupInfo();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
@@ -2414,15 +2414,15 @@ SDValue SITargetLowering::LowerINTRINSIC
SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
@@ -4182,7 +4182,7 @@ SITargetLowering::getRegForInlineAsmCons
default:
return std::make_pair(0U, nullptr);
case 32:
- return std::make_pair(0U, &AMDGPU::SReg_32RegClass);
+ return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
case 64:
return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
case 128:
Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp Fri Nov 25 11:37:09 2016
@@ -532,6 +532,7 @@ bool SIInsertWaits::runOnMachineFunction
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
IV = getIsaVersion(ST->getFeatureBits());
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
HardwareLimits.Named.VM = getVmcntBitMask(IV);
HardwareLimits.Named.EXP = getExpcntBitMask(IV);
@@ -543,20 +544,27 @@ bool SIInsertWaits::runOnMachineFunction
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
IsFlatOutstanding = false;
- ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();
+ ReturnsVoid = MFI->returnsVoid();
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
SmallVector<MachineInstr *, 4> RemoveMI;
+ SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+ bool HaveScalarStores = false;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
+
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
+ if (!HaveScalarStores && TII->isScalarStore(*I))
+ HaveScalarStores = true;
+
if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
// There is a hardware bug on CI/SI where SMRD instruction may corrupt
// vccz bit, so when we detect that an instruction may read from a
@@ -625,12 +633,45 @@ bool SIInsertWaits::runOnMachineFunction
pushInstruction(MBB, I, Increment);
handleSendMsg(MBB, I);
+
+ if (I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN)
+ EndPgmBlocks.push_back(&MBB);
}
// Wait for everything at the end of the MBB
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
}
+ if (HaveScalarStores) {
+ // If scalar writes are used, the cache must be flushed or else the next
+ // wave to reuse the same scratch memory can be clobbered.
+ //
+ // Insert s_dcache_wb at wave termination points if there were any scalar
+ // stores, and only if the cache hasn't already been flushed. This could be
+ // improved by looking across blocks for flushes in postdominating blocks
+ // from the stores but an explicitly requested flush is probably very rare.
+ for (MachineBasicBlock *MBB : EndPgmBlocks) {
+ bool SeenDCacheWB = false;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+
+ if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+ SeenDCacheWB = true;
+ else if (TII->isScalarStore(*I))
+ SeenDCacheWB = false;
+
+ // FIXME: It would be better to insert this before a waitcnt if any.
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
+ Changes = true;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ }
+ }
+ }
+ }
+
for (MachineInstr *I : RemoveMI)
I->eraseFromParent();
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Fri Nov 25 11:37:09 2016
@@ -364,7 +364,8 @@ void SIInstrInfo::copyPhysReg(MachineBas
return;
}
- if (RC == &AMDGPU::SReg_32RegClass) {
+ if (RC == &AMDGPU::SReg_32_XM0RegClass ||
+ RC == &AMDGPU::SReg_32RegClass) {
if (SrcReg == AMDGPU::SCC) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
.addImm(-1)
@@ -544,7 +545,7 @@ void SIInstrInfo::storeRegToStackSlot(Ma
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
}
- BuildMI(MBB, MI, DL, OpDesc)
+ MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
@@ -554,6 +555,11 @@ void SIInstrInfo::storeRegToStackSlot(Ma
// needing them, and need to ensure that the reserved registers are
// correctly handled.
+ if (ST.hasScalarStores()) {
+ // m0 is used for offset to scalar stores if used to spill.
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ }
+
return;
}
@@ -643,12 +649,17 @@ void SIInstrInfo::loadRegFromStackSlot(M
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- BuildMI(MBB, MI, DL, OpDesc, DestReg)
+ MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+ if (ST.hasScalarStores()) {
+ // m0 is used for offset to scalar stores if used to spill.
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ }
+
return;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Fri Nov 25 11:37:09 2016
@@ -24,6 +24,12 @@
using namespace llvm;
+static cl::opt<bool> EnableSpillSGPRToSMEM(
+ "amdgpu-spill-sgpr-to-smem",
+ cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
+ cl::init(false));
+
+
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)
@@ -237,7 +243,7 @@ void SIRegisterInfo::materializeFrameBas
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);
@@ -401,28 +407,36 @@ static bool buildMUBUFOffsetLoadStore(co
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
- const MachineOperand *SrcDst,
+ int Index,
+ unsigned ValueReg,
+ bool IsKill,
unsigned ScratchRsrcReg,
- unsigned ScratchOffset,
- int64_t Offset,
+ unsigned ScratchOffsetReg,
+ int64_t InstOffset,
+ MachineMemOperand *MMO,
RegScavenger *RS) const {
- unsigned Value = SrcDst->getReg();
- bool IsKill = SrcDst->isKill();
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
- DebugLoc DL = MI->getDebugLoc();
- bool IsStore = MI->mayStore();
+ const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsStore = Desc.mayStore();
bool RanOutOfSGPRs = false;
bool Scavenged = false;
- unsigned SOffset = ScratchOffset;
- unsigned OriginalImmOffset = Offset;
+ unsigned SOffset = ScratchOffsetReg;
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
unsigned Size = NumSubRegs * 4;
+ int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+ const int64_t OriginalImmOffset = Offset;
+
+ unsigned Align = MFI.getObjectAlignment(Index);
+ const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
if (!isUInt<12>(Offset + Size)) {
SOffset = AMDGPU::NoRegister;
@@ -441,19 +455,23 @@ void SIRegisterInfo::buildSpillLoadStore
// subtract the offset after the spill to return ScratchOffset to its
// original value.
RanOutOfSGPRs = true;
- SOffset = ScratchOffset;
+ SOffset = ScratchOffsetReg;
} else {
Scavenged = true;
}
+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
- .addReg(ScratchOffset)
- .addImm(Offset);
+ .addReg(ScratchOffsetReg)
+ .addImm(Offset);
+
Offset = 0;
}
- for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
+ const unsigned EltSize = 4;
+
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
unsigned SubReg = NumSubRegs == 1 ?
- Value : getSubReg(Value, getSubRegFromChannel(i));
+ ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -463,40 +481,65 @@ void SIRegisterInfo::buildSpillLoadStore
SrcDstRegState |= getKillRegState(IsKill);
}
- BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(SubReg, getDefRegState(!IsStore))
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+ MachineMemOperand *NewMMO
+ = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+ EltSize, MinAlign(Align, EltSize * i));
+
+ auto MIB = BuildMI(*MBB, MI, DL, Desc)
+ .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
.addReg(ScratchRsrcReg)
.addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
- .addReg(Value, RegState::Implicit | SrcDstRegState)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addMemOperand(NewMMO);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
}
+
if (RanOutOfSGPRs) {
// Subtract the offset we added to the ScratchOffset register.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
- .addReg(ScratchOffset)
- .addImm(OriginalImmOffset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
+ .addReg(ScratchOffsetReg)
+ .addImm(OriginalImmOffset);
}
}
void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS) const {
- MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- const DebugLoc &DL = MI->getDebugLoc();
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned SuperReg = MI->getOperand(0).getReg();
bool IsKill = MI->getOperand(0).isKill();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+
+ bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+ const unsigned EltSize = 4;
+ unsigned OffsetReg = AMDGPU::M0;
+ unsigned M0CopyReg = AMDGPU::NoRegister;
+
+ if (SpillToSMEM) {
+ if (RS->isRegUsed(AMDGPU::M0)) {
+ M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+ .addReg(AMDGPU::M0);
+ }
+ }
// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
@@ -504,21 +547,43 @@ void SIRegisterInfo::spillSGPR(MachineBa
unsigned SubReg = NumSubRegs == 1 ?
SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+ if (SpillToSMEM) {
+ int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ EltSize, MinAlign(Align, EltSize * i));
+
+ // Add i * 4 wave offset.
+ //
+ // SMEM instructions only support a single offset, so increment the wave
+ // offset.
+
+ int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+ if (Offset != 0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
+ .addReg(SubReg, getKillRegState(IsKill)) // sdata
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addMemOperand(MMO);
+
+ continue;
+ }
+
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
if (Spill.hasReg()) {
- if (SuperReg == AMDGPU::M0) {
- assert(NumSubRegs == 1);
- unsigned CopyM0
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), CopyM0)
- .addReg(SuperReg, getKillRegState(IsKill));
-
- // The real spill now kills the temp copy.
- SubReg = SuperReg = CopyM0;
- IsKill = true;
- }
-
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
@@ -530,10 +595,9 @@ void SIRegisterInfo::spillSGPR(MachineBa
// it are fixed.
} else {
// Spill SGPR to a frame index.
- // FIXME we should use S_STORE_DWORD here for VI.
-
// TODO: Should VI try to spill to VGPR and then spill to SMEM?
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ // TODO: Should VI try to spill to VGPR and then spill to SMEM?
MachineInstrBuilder Mov
= BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
@@ -550,13 +614,12 @@ void SIRegisterInfo::spillSGPR(MachineBa
Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
}
- unsigned Size = FrameInfo.getObjectSize(Index);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
MachineMemOperand *MMO
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- Size, Align);
+ EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
.addReg(TmpReg, RegState::Kill) // src
.addFrameIndex(Index) // vaddr
@@ -567,6 +630,11 @@ void SIRegisterInfo::spillSGPR(MachineBa
}
}
+ if (M0CopyReg != AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0CopyReg, RegState::Kill);
+ }
+
MI->eraseFromParent();
MFI->addToSpilledSGPRs(NumSubRegs);
}
@@ -585,42 +653,86 @@ void SIRegisterInfo::restoreSGPR(Machine
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned SuperReg = MI->getOperand(0).getReg();
+ bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
- // m0 is not allowed as with readlane/writelane, so a temporary SGPR and
- // extra copy is needed.
- bool IsM0 = (SuperReg == AMDGPU::M0);
- if (IsM0) {
- assert(NumSubRegs == 1);
- SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+ unsigned OffsetReg = AMDGPU::M0;
+ unsigned M0CopyReg = AMDGPU::NoRegister;
+
+ if (SpillToSMEM) {
+ if (RS->isRegUsed(AMDGPU::M0)) {
+ M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+ .addReg(AMDGPU::M0);
+ }
}
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+ const unsigned EltSize = 4;
+
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
unsigned SubReg = NumSubRegs == 1 ?
SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+ if (SpillToSMEM) {
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ EltSize, MinAlign(Align, EltSize * i));
+
+ // Add i * 4 offset
+ int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+ if (Offset != 0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addMemOperand(MMO);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+
+ continue;
+ }
+
SIMachineFunctionInfo::SpilledReg Spill
= MFI->getSpilledReg(MF, Index, i);
if (Spill.hasReg()) {
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- SubReg)
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ SubReg)
.addReg(Spill.VGPR)
- .addImm(Spill.Lane)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ .addImm(Spill.Lane);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
} else {
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
-
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned Align = FrameInfo.getObjectAlignment(Index);
- unsigned Size = FrameInfo.getObjectSize(Index);
MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad, EltSize,
+ MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
.addFrameIndex(Index) // vaddr
@@ -628,16 +740,19 @@ void SIRegisterInfo::restoreSGPR(Machine
.addReg(MFI->getScratchWaveOffsetReg()) // soffset
.addImm(i * 4) // offset
.addMemOperand(MMO);
- BuildMI(*MBB, MI, DL,
- TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpReg, RegState::Kill)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+ .addReg(TmpReg, RegState::Kill);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
}
}
- if (IsM0 && SuperReg != AMDGPU::M0) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(SuperReg);
+ if (M0CopyReg != AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0CopyReg, RegState::Kill);
}
MI->eraseFromParent();
@@ -685,28 +800,38 @@ void SIRegisterInfo::eliminateFrameIndex
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
- case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
+ Index,
+ VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
- FrameInfo.getObjectOffset(Index) +
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
MI->eraseFromParent();
break;
+ }
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
+
buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
+ Index,
+ VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
- FrameInfo.getObjectOffset(Index) +
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
MI->eraseFromParent();
break;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h Fri Nov 25 11:37:09 2016
@@ -253,9 +253,14 @@ public:
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp, const MachineOperand *SrcDst,
- unsigned ScratchRsrcReg, unsigned ScratchOffset,
- int64_t Offset,
+ unsigned LoadStoreOp,
+ int Index,
+ unsigned ValueReg,
+ bool ValueIsKill,
+ unsigned ScratchRsrcReg,
+ unsigned ScratchOffsetReg,
+ int64_t InstrOffset,
+ MachineMemOperand *MMO,
RegScavenger *RS) const;
};
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td Fri Nov 25 11:37:09 2016
@@ -120,6 +120,11 @@ def SCC_CLASS : RegisterClass<"AMDGPU",
let isAllocatable = 0;
}
+def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
+ let CopyCost = 1;
+ let isAllocatable = 0;
+}
+
// TODO: Do we need to set DwarfRegAlias on register tuples?
// SGPR 32-bit registers
@@ -259,8 +264,9 @@ def SReg_32_XM0 : RegisterClass<"AMDGPU"
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
- (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> {
+ (add SReg_32_XM0, M0_CLASS)> {
let AllocationPriority = 1;
+ let isAllocatable = 0;
}
def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
Modified: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp Fri Nov 25 11:37:09 2016
@@ -437,7 +437,7 @@ bool SIWholeQuadMode::requiresCorrectSta
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before) {
- unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineInstr *Save =
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll Fri Nov 25 11:37:09 2016
@@ -1,16 +1,20 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
-; CHECK-LABEL: {{^}}max_14_sgprs:
+; If spilling to smem, additional registers are used for the resource
+; descriptor.
+
+; ALL-LABEL: {{^}}max_14_sgprs:
; FIXME: Should be able to skip this copying of the private segment
; buffer because all the SGPR spills are to VGPRs.
-; CHECK: s_mov_b64 s[6:7], s[2:3]
-; CHECK: s_mov_b64 s[4:5], s[0:1]
-
-; CHECK: SGPRBlocks: 1
-; CHECK: NumSGPRsForWavesPerEU: 14
+; ALL: s_mov_b64 s[6:7], s[2:3]
+; ALL: s_mov_b64 s[4:5], s[0:1]
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 14
define void @max_14_sgprs(i32 addrspace(1)* %out1,
+
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
@@ -31,7 +35,7 @@ define void @max_14_sgprs(i32 addrspace(
; ---------------------
; total: 14
-; + reserved vcc, flat_scratch = 18
+; + reserved vcc, xnack, flat_scratch = 20
; Because we can't handle re-using the last few input registers as the
; special vcc etc. registers (as well as decide to not use the unused
@@ -40,14 +44,14 @@ define void @max_14_sgprs(i32 addrspace(
; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
; TOSGPR: SGPRBlocks: 2
-; TOSGPR: NumSGPRsForWavesPerEU: 18
+; TOSGPR: NumSGPRsForWavesPerEU: 20
; TOSMEM: s_mov_b64 s[6:7], s[2:3]
-; TOSMEM: s_mov_b32 s9, s13
; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+; TOSMEM: s_mov_b32 s3, s13
; TOSMEM: SGPRBlocks: 2
-; TOSMEM: NumSGPRsForWavesPerEU: 18
+; TOSMEM: NumSGPRsForWavesPerEU: 20
define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
@@ -79,12 +83,12 @@ define void @max_12_sgprs_14_input_sgprs
; ; swapping the order the registers are copied from what normally
; ; happens.
-; TOSMEM: s_mov_b64 s[6:7], s[2:3]
-; TOSMEM: s_mov_b64 s[4:5], s[0:1]
-; TOSMEM: s_mov_b32 s3, s11
+; TOSMEM: s_mov_b32 s5, s11
+; TOSMEM: s_add_u32 m0, s5,
+; TOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0
-; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 16
+; ALL: SGPRBlocks: 2
+; ALL: NumSGPRsForWavesPerEU: 18
define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
Modified: llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll Fri Nov 25 11:37:09 2016
@@ -1,5 +1,5 @@
; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
Modified: llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll Fri Nov 25 11:37:09 2016
@@ -26,9 +26,9 @@
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
@@ -55,11 +55,11 @@
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
@@ -108,9 +108,9 @@ endif:
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
@@ -133,11 +133,11 @@ endif:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
@@ -187,9 +187,9 @@ end:
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, [[CMP0]]
; GCN: s_waitcnt vmcnt(0) expcnt(0)
@@ -208,7 +208,7 @@ end:
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]]
@@ -224,9 +224,9 @@ end:
; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
@@ -255,11 +255,11 @@ end:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/detect-dead-lanes.mir Fri Nov 25 11:37:09 2016
@@ -27,9 +27,9 @@
# CHECK: S_NOP 0, implicit undef %5.sub0
name: test0
registers:
- - { id: 0, class: sreg_32 }
- - { id: 1, class: sreg_32 }
- - { id: 2, class: sreg_32 }
+ - { id: 0, class: sreg_32_xm0 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sreg_32_xm0 }
- { id: 3, class: sreg_128 }
- { id: 4, class: sreg_64 }
- { id: 5, class: sreg_64 }
@@ -87,13 +87,13 @@ registers:
- { id: 0, class: sreg_128 }
- { id: 1, class: sreg_128 }
- { id: 2, class: sreg_64 }
- - { id: 3, class: sreg_32 }
+ - { id: 3, class: sreg_32_xm0 }
- { id: 4, class: sreg_128 }
- { id: 5, class: sreg_64 }
- - { id: 6, class: sreg_32 }
- - { id: 7, class: sreg_32 }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_64 }
- - { id: 9, class: sreg_32 }
+ - { id: 9, class: sreg_32_xm0 }
- { id: 10, class: sreg_128 }
body: |
bb.0:
@@ -162,12 +162,12 @@ body: |
name: test2
registers:
- - { id: 0, class: sreg_32 }
- - { id: 1, class: sreg_32 }
+ - { id: 0, class: sreg_32_xm0 }
+ - { id: 1, class: sreg_32_xm0 }
- { id: 2, class: sreg_64 }
- { id: 3, class: sreg_128 }
- - { id: 4, class: sreg_32 }
- - { id: 5, class: sreg_32 }
+ - { id: 4, class: sreg_32_xm0 }
+ - { id: 5, class: sreg_32_xm0 }
- { id: 6, class: sreg_64 }
- { id: 7, class: sreg_128 }
- { id: 8, class: sreg_64 }
@@ -260,7 +260,7 @@ body: |
name: test5
tracksRegLiveness: true
registers:
- - { id: 0, class: sreg_32 }
+ - { id: 0, class: sreg_32_xm0 }
- { id: 1, class: sreg_64 }
body: |
bb.0:
@@ -286,9 +286,9 @@ body: |
name: loop0
tracksRegLiveness: true
registers:
- - { id: 0, class: sreg_32 }
- - { id: 1, class: sreg_32 }
- - { id: 2, class: sreg_32 }
+ - { id: 0, class: sreg_32_xm0 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sreg_32_xm0 }
- { id: 3, class: sreg_128 }
- { id: 4, class: sreg_128 }
- { id: 5, class: sreg_128 }
@@ -339,10 +339,10 @@ body: |
name: loop1
tracksRegLiveness: true
registers:
- - { id: 0, class: sreg_32 }
- - { id: 1, class: sreg_32 }
- - { id: 2, class: sreg_32 }
- - { id: 3, class: sreg_32 }
+ - { id: 0, class: sreg_32_xm0 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sreg_32_xm0 }
+ - { id: 3, class: sreg_32_xm0 }
- { id: 4, class: sreg_128 }
- { id: 5, class: sreg_128 }
- { id: 6, class: sreg_128 }
@@ -390,7 +390,7 @@ body: |
name: loop2
tracksRegLiveness: true
registers:
- - { id: 0, class: sreg_32 }
+ - { id: 0, class: sreg_32_xm0 }
- { id: 1, class: sreg_128 }
- { id: 2, class: sreg_128 }
- { id: 3, class: sreg_128 }
Modified: llvm/trunk/test/CodeGen/AMDGPU/inline-constraints.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/inline-constraints.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/inline-constraints.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/inline-constraints.ll Fri Nov 25 11:37:09 2016
@@ -22,10 +22,11 @@ entry:
ret void
}
+; FIXME: Should be able to avoid copy
; GCN-LABEL: {{^}}inline_sreg_constraint_m0:
; GCN: s_mov_b32 m0, -1
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, m0
-; GCN: ; use m0
+; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
+; GCN: ; use [[COPY_M0]]
define void @inline_sreg_constraint_m0() {
%m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"()
tail call void asm sideeffect "; use $0", "s"(i32 %m0)
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll Fri Nov 25 11:37:09 2016
@@ -22,7 +22,8 @@ define void @test_readfirstlane_imm(i32
; TODO: m0 should be folded.
; CHECK-LABEL: {{^}}test_readfirstlane_m0:
; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
+; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
+; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]]
define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
%m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll Fri Nov 25 11:37:09 2016
@@ -22,7 +22,8 @@ define void @test_readlane_imm_sreg(i32
; TODO: m0 should be folded.
; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
+; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
+; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
%m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
Modified: llvm/trunk/test/CodeGen/AMDGPU/read_register.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/read_register.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/read_register.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/read_register.ll Fri Nov 25 11:37:09 2016
@@ -3,9 +3,11 @@
declare i32 @llvm.read_register.i32(metadata) #0
declare i64 @llvm.read_register.i64(metadata) #0
+; FIXME: Should be able to eliminate copy
; CHECK-LABEL: {{^}}test_read_m0:
; CHECK: s_mov_b32 m0, -1
-; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], m0
+; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
+; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]]
; CHECK: buffer_store_dword [[COPY]]
define void @test_read_m0(i32 addrspace(1)* %out) #0 {
store volatile i32 0, i32 addrspace(3)* undef
Modified: llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll Fri Nov 25 11:37:09 2016
@@ -1,14 +1,44 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
; Make sure this doesn't crash.
-; CHECK: {{^}}test:
+; ALL-LABEL: {{^}}test:
+; ALL: s_mov_b32 s92, SCRATCH_RSRC_DWORD0
+; ALL: s_mov_b32 s91, s3
+
; Make sure we are handling hazards correctly.
-; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
-; CHECK-NEXT: s_nop 4
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
-; CHECK: s_endpgm
+; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
+; SGPR-NEXT: s_waitcnt vmcnt(0)
+; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
+; SGPR-NEXT: s_nop 4
+; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
+
+
+; Make sure scratch wave offset register is correctly incremented and
+; then restored.
+; SMEM: s_mov_b32 m0, s91{{$}}
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
+; SMEM: s_add_u32 m0, s91, 0x100{{$}}
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
+; SMEM: s_add_u32 m0, s91, 0x200{{$}}
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
+; SMEM: s_add_u32 m0, s91, 0x300{{$}}
+; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
+
+
+; SMEM: s_mov_b32 m0, s91{{$}}
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
+; SMEM: s_add_u32 m0, s91, 0x100{{$}}
+; SMEM: s_waitcnt lgkmcnt(0)
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
+; SMEM: s_add_u32 m0, s91, 0x200{{$}}
+; SMEM: s_waitcnt lgkmcnt(0)
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
+; SMEM: s_add_u32 m0, s91, 0x300{{$}}
+; SMEM: s_waitcnt lgkmcnt(0)
+; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
+
+; ALL: s_endpgm
define void @test(i32 addrspace(1)* %out, i32 %in) {
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll Fri Nov 25 11:37:09 2016
@@ -1,33 +1,47 @@
; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s
; XXX - Why does it like to use vcc?
; GCN-LABEL: {{^}}spill_m0:
-; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; TOSMEM: s_mov_b32 s84, SCRATCH_RSRC_DWORD0
-; GCN: s_cmp_lg_u32
+; GCN-DAG: s_cmp_lg_u32
-; TOVGPR: s_mov_b32 vcc_hi, m0
-; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], vcc_hi, 0
+; TOVGPR-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
+; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0
-; TOVMEM: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], m0
+; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
+; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
; TOVMEM: s_waitcnt vmcnt(0)
+
+; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
+; TOSMEM: s_mov_b32 m0, s3{{$}}
+; TOSMEM-NOT: [[M0_COPY]]
+; TOSMEM: s_buffer_store_dword [[M0_COPY]], s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_waitcnt lgkmcnt(0)
+
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN: [[ENDIF]]:
-; TOVGPR: v_readlane_b32 vcc_hi, [[SPILL_VREG]], 0
-; TOVGPR: s_mov_b32 m0, vcc_hi
+; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0
+; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]
; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload
; TOVMEM: s_waitcnt vmcnt(0)
-; TOVMEM: v_readfirstlane_b32 vcc_hi, [[RELOAD_VREG]]
-; TOVMEM: s_mov_b32 m0, vcc_hi
+; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]]
+; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
+
+; TOSMEM: s_mov_b32 m0, s3{{$}}
+; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM-NOT: [[M0_RESTORE]]
+; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]
-; GCN: s_add_i32 m0, m0, 1
+; GCN: s_add_i32 s{{[0-9]+}}, m0, 1
define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
@@ -47,7 +61,33 @@ endif:
@lds = internal addrspace(3) global [64 x float] undef
; GCN-LABEL: {{^}}spill_m0_lds:
+; GCN: s_mov_b32 m0, s6
+; GCN: v_interp_mov_f32
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_mov_b32 m0, s7
+; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM: s_mov_b32 m0, vcc_hi
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_add_u32 m0, s7, 0x100
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM: s_mov_b32 m0, vcc_hi
+
+; TOSMEM: s_mov_b64 exec,
+; TOSMEM: s_cbranch_execz
+; TOSMEM: s_branch
+
+; TOSMEM: BB{{[0-9]+_[0-9]+}}:
+; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100
+; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
+
+
; GCN-NOT: v_readlane_b32 m0
+; GCN-NOT: s_buffer_store_dword m0
+; GCN-NOT: s_buffer_load_dword m0
define amdgpu_ps void @spill_m0_lds(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) #0 {
main_body:
%4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
@@ -71,6 +111,52 @@ endif:
ret void
}
+; GCN-LABEL: {{^}}restore_m0_lds:
+; TOSMEM: s_cmp_eq_u32
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_mov_b32 m0, vcc_hi
+; TOSMEM: s_cbranch_scc1
+
+; TOSMEM: s_mov_b32 m0, -1
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_buffer_load_dword s4, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_waitcnt lgkmcnt(0)
+; TOSMEM: s_buffer_load_dword s5, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_mov_b32 m0, vcc_hi
+; TOSMEM: s_waitcnt lgkmcnt(0)
+
+; TOSMEM: ds_write_b64
+
+; TOSMEM: s_mov_b32 vcc_hi, m0
+; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_buffer_load_dword s0, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_mov_b32 m0, vcc_hi
+; TOSMEM: s_waitcnt lgkmcnt(0)
+; TOSMEM: s_mov_b32 m0, s0
+; TOSMEM: ; use m0
+
+; TOSMEM: s_dcache_wb
+; TOSMEM: s_endpgm
+define void @restore_m0_lds(i32 %arg) {
+ %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
+ %sval = load volatile i64, i64 addrspace(2)* undef
+ %cmp = icmp eq i32 %arg, 0
+ br i1 %cmp, label %ret, label %bb
+
+bb:
+ store volatile i64 %sval, i64 addrspace(3)* undef
+ call void asm sideeffect "; use $0", "{M0}"(i32 %m0) #0
+ br label %ret
+
+ret:
+ ret void
+}
+
declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
declare i32 @llvm.SI.packf16(float, float) readnone
Modified: llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll Fri Nov 25 11:37:09 2016
@@ -20,8 +20,8 @@
; VI-DAG: s_mov_b32 s15, 0xe80000
; s11 is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024
Added: llvm/trunk/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir?rev=287942&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir (added)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/scalar-store-cache-flush.mir Fri Nov 25 11:37:09 2016
@@ -0,0 +1,173 @@
+# RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s
+
+--- |
+ define void @basic_insert_dcache_wb() {
+ ret void
+ }
+
+ define void @explicit_flush_after() {
+ ret void
+ }
+
+ define void @explicit_flush_before() {
+ ret void
+ }
+
+ define void @no_scalar_store() {
+ ret void
+ }
+
+ define void @multi_block_store() {
+ bb0:
+ br i1 undef, label %bb1, label %bb2
+
+ bb1:
+ ret void
+
+ bb2:
+ ret void
+ }
+
+ define void @one_block_store() {
+ bb0:
+ br i1 undef, label %bb1, label %bb2
+
+ bb1:
+ ret void
+
+ bb2:
+ ret void
+ }
+
+ define amdgpu_ps float @si_return() {
+ ret float undef
+ }
+
+...
+---
+# CHECK-LABEL: name: basic_insert_dcache_wb
+# CHECK: bb.0:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: basic_insert_dcache_wb
+tracksRegLiveness: false
+
+body: |
+ bb.0:
+ S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+ S_ENDPGM
+...
+---
+# Already has an explicitly requested flush after the last store.
+# CHECK-LABEL: name: explicit_flush_after
+# CHECK: bb.0:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: explicit_flush_after
+tracksRegLiveness: false
+
+body: |
+ bb.0:
+ S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+ S_DCACHE_WB
+ S_ENDPGM
+...
+---
+# Already has an explicitly requested flush before the last store.
+# CHECK-LABEL: name: explicit_flush_before
+# CHECK: bb.0:
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: explicit_flush_before
+tracksRegLiveness: false
+
+body: |
+ bb.0:
+ S_DCACHE_WB
+ S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+ S_ENDPGM
+...
+---
+# CHECK-LABEL: no_scalar_store
+# CHECK: bb.0
+# CHECK-NEXT: S_ENDPGM
+name: no_scalar_store
+tracksRegLiveness: false
+
+body: |
+ bb.0:
+ S_ENDPGM
+...
+
+# CHECK-LABEL: name: multi_block_store
+# CHECK: bb.0:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+# CHECK: bb.1:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: multi_block_store
+tracksRegLiveness: false
+
+body: |
+ bb.0:
+ S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+ S_ENDPGM
+
+ bb.1:
+ S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0
+ S_ENDPGM
+...
+...
+
+# This one should be able to omit the flush in the storeless block but
+# this isn't handled now.
+
+# CHECK-LABEL: name: one_block_store
+# CHECK: bb.0:
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+# CHECK: bb.1:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: S_ENDPGM
+
+name: one_block_store
+tracksRegLiveness: false
+
+body: |
+ bb.0:
+ S_ENDPGM
+
+ bb.1:
+ S_STORE_DWORD_SGPR undef %sgpr4, undef %sgpr6_sgpr7, undef %m0, 0
+ S_ENDPGM
+...
+---
+# CHECK-LABEL: name: si_return
+# CHECK: bb.0:
+# CHECK-NEXT: S_STORE_DWORD
+# CHECK-NEXT: S_WAITCNT
+# CHECK-NEXT: S_DCACHE_WB
+# CHECK-NEXT: SI_RETURN
+
+name: si_return
+tracksRegLiveness: false
+
+body: |
+ bb.0:
+ S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
+ SI_RETURN undef %vgpr0
+...
Modified: llvm/trunk/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir?rev=287942&r1=287941&r2=287942&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir (original)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/si-fix-sgpr-copies.mir Fri Nov 25 11:37:09 2016
@@ -6,14 +6,14 @@
name: phi_visit_order
tracksRegLiveness: true
registers:
- - { id: 0, class: sreg_32 }
+ - { id: 0, class: sreg_32_xm0 }
- { id: 1, class: sreg_64 }
- - { id: 2, class: sreg_32 }
+ - { id: 2, class: sreg_32_xm0 }
- { id: 7, class: vgpr_32 }
- - { id: 8, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
- { id: 9, class: vgpr_32 }
- { id: 10, class: sreg_64 }
- - { id: 11, class: sreg_32 }
+ - { id: 11, class: sreg_32_xm0 }
body: |
; GCN-LABEL: name: phi_visit_order
More information about the llvm-commits mailing list