[llvm] r283177 - AMDGPU: Refactor indirect vector lowering
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 3 18:41:06 PDT 2016
Author: arsenm
Date: Mon Oct 3 20:41:05 2016
New Revision: 283177
URL: http://llvm.org/viewvc/llvm-project?rev=283177&view=rev
Log:
AMDGPU: Refactor indirect vector lowering
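Allow inserting multiple instructions in the expanded loop: the
loop-emitting helpers no longer take a pre-built MovRel instruction and
instead return an insertion point inside the loop at which the caller
builds its instructions.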
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=283177&r1=283176&r2=283177&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Mon Oct 3 20:41:05 2016
@@ -1118,18 +1118,18 @@ MachineBasicBlock *SITargetLowering::spl
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
-static void emitLoadM0FromVGPRLoop(const SIInstrInfo *TII,
- MachineRegisterInfo &MRI,
- MachineBasicBlock &OrigBB,
- MachineBasicBlock &LoopBB,
- const DebugLoc &DL,
- MachineInstr *MovRel,
- const MachineOperand &IdxReg,
- unsigned InitReg,
- unsigned ResultReg,
- unsigned PhiReg,
- unsigned InitSaveExecReg,
- int Offset) {
+static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
+ const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI,
+ MachineBasicBlock &OrigBB,
+ MachineBasicBlock &LoopBB,
+ const DebugLoc &DL,
+ const MachineOperand &IdxReg,
+ unsigned InitReg,
+ unsigned ResultReg,
+ unsigned PhiReg,
+ unsigned InitSaveExecReg,
+ int Offset) {
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -1174,11 +1174,9 @@ static void emitLoadM0FromVGPRLoop(const
MRI.setSimpleHint(NewExec, CondReg);
- // Do the actual move.
- LoopBB.insert(I, MovRel);
-
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ MachineInstr *InsertPt =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(NewExec);
@@ -1188,6 +1186,8 @@ static void emitLoadM0FromVGPRLoop(const
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addMBB(&LoopBB);
+
+ return InsertPt->getIterator();
}
// This has slightly sub-optimal regalloc when the source vector is killed by
@@ -1195,13 +1195,12 @@ static void emitLoadM0FromVGPRLoop(const
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
// subregister from it, using 1 more VGPR than necessary. This was saved when
// this was expanded after register allocation.
-static MachineBasicBlock *loadM0FromVGPR(const SIInstrInfo *TII,
- MachineBasicBlock &MBB,
- MachineInstr &MI,
- MachineInstr *MovRel,
- unsigned InitResultReg,
- unsigned PhiReg,
- int Offset) {
+static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
+ MachineBasicBlock &MBB,
+ MachineInstr &MI,
+ unsigned InitResultReg,
+ unsigned PhiReg,
+ int Offset) {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
@@ -1238,8 +1237,9 @@ static MachineBasicBlock *loadM0FromVGPR
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, MovRel, *Idx,
- InitResultReg, DstReg, PhiReg, TmpExec, Offset);
+ auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
+ InitResultReg, DstReg, PhiReg, TmpExec,
+ Offset);
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -1247,7 +1247,7 @@ static MachineBasicBlock *loadM0FromVGPR
MI.eraseFromParent();
- return RemainderBB;
+ return InsPt;
}
// Returns subreg index, offset
@@ -1298,7 +1298,8 @@ static bool setM0ToIndexFromSGPR(const S
// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SIInstrInfo *TII) {
+ const SISubtarget &ST) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -1333,17 +1334,21 @@ static MachineBasicBlock *emitIndirectSr
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
- MachineInstr *MovRel =
- BuildMI(*MF, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset);
+
+ BuildMI(*InsPt->getParent(), InsPt, DL,
+ TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SrcVec->getReg(), RegState::Undef, SubReg)
.addReg(SrcVec->getReg(), RegState::Implicit);
- return loadM0FromVGPR(TII, MBB, MI, MovRel, InitReg, PhiReg, Offset);
+ return InsPt->getParent();
}
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SIInstrInfo *TII) {
+ const SISubtarget &ST) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -1404,9 +1409,11 @@ static MachineBasicBlock *emitIndirectDs
const DebugLoc &DL = MI.getDebugLoc();
unsigned PhiReg = MRI.createVirtualRegister(VecRC);
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset);
+
// vdst is not actually read and just provides the base register index.
MachineInstr *MovRel =
- BuildMI(*MF, DL, MovRelDesc)
+ BuildMI(*InsPt->getParent(), InsPt, DL, MovRelDesc)
.addReg(PhiReg, RegState::Undef, SubReg) // vdst
.addOperand(*Val)
.addReg(Dst, RegState::ImplicitDefine)
@@ -1418,8 +1425,7 @@ static MachineBasicBlock *emitIndirectDs
MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
- return loadM0FromVGPR(TII, MBB, MI, MovRel,
- SrcVec->getReg(), PhiReg, Offset);
+ return InsPt->getParent();
}
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
@@ -1450,13 +1456,13 @@ MachineBasicBlock *SITargetLowering::Emi
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V16:
- return emitIndirectSrc(MI, *BB, getSubtarget()->getInstrInfo());
+ return emitIndirectSrc(MI, *BB, *getSubtarget());
case AMDGPU::SI_INDIRECT_DST_V1:
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
- return emitIndirectDst(MI, *BB, getSubtarget()->getInstrInfo());
+ return emitIndirectDst(MI, *BB, *getSubtarget());
case AMDGPU::SI_KILL:
return splitKillBlock(MI, BB);
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
More information about the llvm-commits
mailing list