[llvm] r373638 - AMDGPU/GlobalISel: Split 64-bit vector extracts during RegBankSelect
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 3 10:55:28 PDT 2019
Author: arsenm
Date: Thu Oct 3 10:55:27 2019
New Revision: 373638
URL: http://llvm.org/viewvc/llvm-project?rev=373638&view=rev
Log:
AMDGPU/GlobalISel: Split 64-bit vector extracts during RegBankSelect
Register indexing of 64-bit elements is possible on the SALU, but not
the VALU. Handle this by splitting the operation into two 32-bit
indexed extracts. Extend the waterfall loop handling to allow moving a
range of instructions.
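As a standalone illustration of the index arithmetic (not part of the
patch; extractElt64 is an invented name and a little-endian host is
assumed), element Idx of a 64-bit element vector is reassembled from
32-bit elements 2 * Idx and 2 * Idx + 1 of the bitcast source:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Extract a 64-bit element via two 32-bit extracts, mirroring the
// G_SHL/G_ADD/G_EXTRACT_VECTOR_ELT sequence the patch emits after
// bitcasting <N x s64> to <2N x s32>.
static uint64_t extractElt64(const std::vector<uint64_t> &Vec,
                             uint32_t Idx) {
  // "Bitcast" the vector to 32-bit pieces.
  std::vector<uint32_t> Vec32(Vec.size() * 2);
  std::memcpy(Vec32.data(), Vec.data(), Vec.size() * sizeof(uint64_t));

  uint32_t IdxLo = Idx << 1;  // G_SHL %Idx, 1
  uint32_t IdxHi = IdxLo + 1; // G_ADD %IdxLo, 1
  uint32_t Lo = Vec32[IdxLo]; // first 32-bit G_EXTRACT_VECTOR_ELT
  uint32_t Hi = Vec32[IdxHi]; // second 32-bit G_EXTRACT_VECTOR_ELT

  // G_MERGE_VALUES of the two halves.
  return uint64_t(Lo) | (uint64_t(Hi) << 32);
}

int main() {
  std::vector<uint64_t> V = {0x0102030405060708ull, 0x1112131415161718ull};
  for (size_t I = 0; I != V.size(); ++I)
    assert(extractElt64(V, I) == V[I]);
  return 0;
}

On the VALU path the shift, add, and both 32-bit extracts are the
instructions moved into the waterfall loop; the bitcast and the
constant 1 are deliberately built before it.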
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp?rev=373638&r1=373637&r2=373638&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp Thu Oct 3 10:55:27 2019
@@ -17,7 +17,6 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -659,43 +658,28 @@ static LLT getHalfSizedType(LLT Ty) {
/// unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineIRBuilder &B,
- MachineInstr &MI,
- MachineRegisterInfo &MRI,
- ArrayRef<unsigned> OpIndices) const {
- MachineFunction *MF = &B.getMF();
- MachineBasicBlock::iterator I(MI);
-
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
-
- // Use a set to avoid extra readfirstlanes in the case where multiple operands
- // are the same register.
- SmallSet<Register, 4> SGPROperandRegs;
- for (unsigned Op : OpIndices) {
- assert(MI.getOperand(Op).isUse());
- Register Reg = MI.getOperand(Op).getReg();
- const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
- if (OpBank->getID() == AMDGPU::VGPRRegBankID)
- SGPROperandRegs.insert(Reg);
- }
-
- // No operands need to be replaced, so no need to loop.
- if (SGPROperandRegs.empty())
- return false;
-
+ iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineRegisterInfo &MRI) const {
SmallVector<Register, 4> ResultRegs;
SmallVector<Register, 4> InitResultRegs;
SmallVector<Register, 4> PhiRegs;
- for (MachineOperand &Def : MI.defs()) {
- LLT ResTy = MRI.getType(Def.getReg());
- const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
- ResultRegs.push_back(Def.getReg());
- Register InitReg = B.buildUndef(ResTy).getReg(0);
- Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
- InitResultRegs.push_back(InitReg);
- PhiRegs.push_back(PhiReg);
- MRI.setRegBank(PhiReg, *DefBank);
- MRI.setRegBank(InitReg, *DefBank);
+
+ MachineBasicBlock &MBB = B.getMBB();
+ MachineFunction *MF = &B.getMF();
+
+ for (MachineInstr &MI : Range) {
+ for (MachineOperand &Def : MI.defs()) {
+ LLT ResTy = MRI.getType(Def.getReg());
+ const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
+ ResultRegs.push_back(Def.getReg());
+ Register InitReg = B.buildUndef(ResTy).getReg(0);
+ Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
+ InitResultRegs.push_back(InitReg);
+ PhiRegs.push_back(PhiReg);
+ MRI.setRegBank(PhiReg, *DefBank);
+ MRI.setRegBank(InitReg, *DefBank);
+ }
}
Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
@@ -724,7 +708,7 @@ bool AMDGPURegisterBankInfo::executeInWa
// Move the rest of the block into a new block.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
MBB.addSuccessor(LoopBB);
RestoreExecBB->addSuccessor(RemainderBB);
@@ -747,139 +731,56 @@ bool AMDGPURegisterBankInfo::executeInWa
.addMBB(LoopBB);
}
- // Move the instruction into the loop.
- LoopBB->splice(LoopBB->end(), &MBB, I);
- I = std::prev(LoopBB->end());
-
- B.setInstr(*I);
-
- Register CondReg;
-
- for (MachineOperand &Op : MI.uses()) {
- if (!Op.isReg())
- continue;
-
- assert(!Op.isDef());
- if (SGPROperandRegs.count(Op.getReg())) {
- LLT OpTy = MRI.getType(Op.getReg());
- unsigned OpSize = OpTy.getSizeInBits();
-
- // Can only do a readlane of 32-bit pieces.
- if (OpSize == 32) {
- // Avoid extra copies in the simple case of one 32-bit register.
- Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- MRI.setType(CurrentLaneOpReg, OpTy);
-
- constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
- .addReg(Op.getReg());
-
- Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
-
- // Compare the just read M0 value to all possible Idx values.
- B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(Op.getReg());
- Op.setReg(CurrentLaneOpReg);
-
- if (!First) {
- Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(AMDGPU::S_AND_B64)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
- }
- } else {
- LLT S32 = LLT::scalar(32);
- SmallVector<Register, 8> ReadlanePieces;
-
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
-
- bool Is64 = OpSize % 64 == 0;
+ const DebugLoc &DL = B.getDL();
- LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
- unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
- : AMDGPU::V_CMP_EQ_U32_e64;
+ // Figure out the iterator range after splicing the instructions.
+ auto NewBegin = std::prev(LoopBB->end());
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
+ // Move the instructions into the loop. Note we already moved everything
+ // after Range.end() into a new block, so Range.end() is no longer valid.
+ LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
- // Insert the unmerge before the loop.
+ auto NewEnd = LoopBB->end();
- B.setMBB(MBB);
- auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
- B.setInstr(*I);
+ MachineBasicBlock::iterator I = Range.begin();
+ B.setInsertPt(*LoopBB, I);
- unsigned NumPieces = Unmerge->getNumOperands() - 1;
- for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
- Register UnmergePiece = Unmerge.getReg(PieceIdx);
-
- Register CurrentLaneOpReg;
- if (Is64) {
- Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
- Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
-
- MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
- MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
- MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegLo)
- .addReg(UnmergePiece, 0, AMDGPU::sub0);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegHi)
- .addReg(UnmergePiece, 0, AMDGPU::sub1);
+ Register CondReg;
- CurrentLaneOpReg =
- B.buildMerge(LLT::scalar(64),
- {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
- .getReg(0);
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ for (MachineOperand &Op : MI.uses()) {
+ if (!Op.isReg() || Op.isDef())
+ continue;
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
-
- if (OpTy.getScalarSizeInBits() == 64) {
- // If we need to produce a 64-bit element vector, so use the
- // merged pieces
- ReadlanePieces.push_back(CurrentLaneOpReg);
- } else {
- // 32-bit element type.
- ReadlanePieces.push_back(CurrentLaneOpRegLo);
- ReadlanePieces.push_back(CurrentLaneOpRegHi);
- }
- } else {
- CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(UnmergePiece);
- ReadlanePieces.push_back(CurrentLaneOpReg);
- }
+ if (SGPROperandRegs.count(Op.getReg())) {
+ LLT OpTy = MRI.getType(Op.getReg());
+ unsigned OpSize = OpTy.getSizeInBits();
+
+ // Can only do a readlane of 32-bit pieces.
+ if (OpSize == 32) {
+ // Avoid extra copies in the simple case of one 32-bit register.
+ Register CurrentLaneOpReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ MRI.setType(CurrentLaneOpReg, OpTy);
+
+ constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(Op.getReg());
Register NewCondReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;
- B.buildInstr(CmpOp)
+ // Compare the just-read M0 value to all possible Idx values.
+ B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
- .addReg(UnmergePiece);
+ .addReg(Op.getReg());
+ Op.setReg(CurrentLaneOpReg);
if (!First) {
Register AndReg
@@ -892,19 +793,115 @@ bool AMDGPURegisterBankInfo::executeInWa
.addReg(CondReg);
CondReg = AndReg;
}
- }
-
- // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
- // BUILD_VECTOR
- if (OpTy.isVector()) {
- auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
} else {
- auto Merge = B.buildMerge(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- }
+ LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 8> ReadlanePieces;
+
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
+
+ bool Is64 = OpSize % 64 == 0;
+
+ LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+ unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+ : AMDGPU::V_CMP_EQ_U32_e64;
+
+ // Insert the unmerge before the loop.
+
+ B.setMBB(MBB);
+ auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+ B.setInstr(*I);
+
+ unsigned NumPieces = Unmerge->getNumOperands() - 1;
+ for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+ Register UnmergePiece = Unmerge.getReg(PieceIdx);
+
+ Register CurrentLaneOpReg;
+ if (Is64) {
+ Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+ Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegLo)
+ .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegHi)
+ .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+ CurrentLaneOpReg =
+ B.buildMerge(LLT::scalar(64),
+ {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+ .getReg(0);
+
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
- MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
+ if (OpTy.getScalarSizeInBits() == 64) {
+ // If we need to produce a 64-bit element vector, use the
+ // merged pieces
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ } else {
+ // 32-bit element type.
+ ReadlanePieces.push_back(CurrentLaneOpRegLo);
+ ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ }
+ } else {
+ CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ }
+
+ Register NewCondReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
+
+ B.buildInstr(CmpOp)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+
+ if (!First) {
+ Register AndReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(AMDGPU::S_AND_B64)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ }
+
+ // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+ // BUILD_VECTOR
+ if (OpTy.isVector()) {
+ auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ } else {
+ auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ }
+
+ MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
+ }
}
}
}
@@ -947,6 +944,40 @@ bool AMDGPURegisterBankInfo::executeInWa
return true;
}
+// Return any unique registers used by \p MI at \p OpIndices that need to be
+// handled in a waterfall loop. Returns these registers in \p
+// SGPROperandRegs. Returns true if there are any operands to handle and a
+// waterfall loop is necessary.
+bool AMDGPURegisterBankInfo::collectWaterfallOperands(
+ SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
+ MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
+ for (unsigned Op : OpIndices) {
+ assert(MI.getOperand(Op).isUse());
+ Register Reg = MI.getOperand(Op).getReg();
+ const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
+ if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+ SGPROperandRegs.insert(Reg);
+ }
+
+ // No operands need to be replaced, so no need to loop.
+ return !SGPROperandRegs.empty();
+}
+
+bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const {
+ // Use a set to avoid extra readfirstlanes in the case where multiple operands
+ // are the same register.
+ SmallSet<Register, 4> SGPROperandRegs;
+
+ if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
+ return false;
+
+ MachineBasicBlock::iterator I = MI.getIterator();
+ return executeInWaterfallLoop(B, make_range(I, std::next(I)),
+ SGPROperandRegs, MRI);
+}
+
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineInstr &MI, MachineRegisterInfo &MRI,
ArrayRef<unsigned> OpIndices) const {
@@ -1602,10 +1633,69 @@ void AMDGPURegisterBankInfo::applyMappin
MI.eraseFromParent();
return;
}
- case AMDGPU::G_EXTRACT_VECTOR_ELT:
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, { 2 });
+ case AMDGPU::G_EXTRACT_VECTOR_ELT: {
+ SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+
+ assert(empty(OpdMapper.getVRegs(1)) && empty(OpdMapper.getVRegs(2)));
+
+ if (DstRegs.empty()) {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, { 2 });
+ return;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register IdxReg = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ assert(DstTy.getSizeInBits() == 64);
+
+ LLT SrcTy = MRI.getType(SrcReg);
+ const LLT S32 = LLT::scalar(32);
+ LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+
+ MachineIRBuilder B(MI);
+ auto CastSrc = B.buildBitcast(Vec32, SrcReg);
+ auto One = B.buildConstant(S32, 1);
+
+ // Split the vector index into 32-bit pieces. Prepare to move all of the
+ // new instructions into a waterfall loop if necessary.
+ //
+ // Don't put the bitcast or constant in the loop.
+ MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
+
+ // Compute the 32-bit element indices (2 * OrigIdx, 2 * OrigIdx + 1).
+ auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxHi = B.buildAdd(S32, IdxLo, One);
+ B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
+ B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
+
+ const ValueMapping &DstMapping
+ = OpdMapper.getInstrMapping().getOperandMapping(0);
+
+ // FIXME: Should be getting from mapping or not?
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
+ MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank);
+ MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
+ MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
+
+ SmallSet<Register, 4> OpsToWaterfall;
+ if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Remove the original instruction to avoid potentially confusing the
+ // waterfall loop logic.
+ B.setInstr(*Span.begin());
+ MI.eraseFromParent();
+ executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
+ OpsToWaterfall, MRI);
return;
+ }
case AMDGPU::G_INTRINSIC: {
switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_s_buffer_load: {
@@ -2317,7 +2407,7 @@ AMDGPURegisterBankInfo::getInstrMapping(
unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
- OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, DstSize);
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
// The index can be either if the source vector is VGPR.
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h?rev=373638&r1=373637&r2=373638&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h Thu Oct 3 10:55:27 2019
@@ -13,6 +13,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -42,6 +44,18 @@ class AMDGPURegisterBankInfo : public AM
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
+ bool collectWaterfallOperands(
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const;
+
+ bool executeInWaterfallLoop(
+ MachineIRBuilder &B,
+ iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineRegisterInfo &MRI) const;
+
bool executeInWaterfallLoop(MachineIRBuilder &B,
MachineInstr &MI,
MachineRegisterInfo &MRI,
Modified: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir?rev=373638&r1=373637&r2=373638&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir Thu Oct 3 10:55:27 2019
@@ -117,6 +117,102 @@ body: |
...
---
+name: extract_vector_elt_v8s64_ss
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16
+ ; CHECK-LABEL: name: extract_vector_elt_v8s64_ss
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16
+ ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16
+ ; CHECK: [[EVEC:%[0-9]+]]:sgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[COPY1]](s32)
+ ; CHECK: $sgpr0_sgpr1 = COPY [[EVEC]](s64)
+ %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ %1:_(s32) = COPY $sgpr16
+ %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1
+ $sgpr0_sgpr1 = COPY %2
+...
+
+---
+name: extract_vector_elt_v8s64_vs
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0
+
+ ; CHECK-LABEL: name: extract_vector_elt_v8s64_vs
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY1]], [[C]](s32)
+ ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
+ ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
+ ; CHECK: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
+ ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+ ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ %1:_(s32) = COPY $sgpr0
+ %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1
+ $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: extract_vector_elt_v8s64_sv
+legalized: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
+ ; CHECK-LABEL: name: extract_vector_elt_v8s64_sv
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
+ ; CHECK: .1:
+ ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF4]], %bb.0, %20, %bb.1
+ ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1
+ ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1
+ ; CHECK: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1
+ ; CHECK: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1
+ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
+ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
+ ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
+ ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
+ ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
+ ; CHECK: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
+ ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+ ; CHECK: .2:
+ ; CHECK: successors: %bb.3(0x80000000)
+ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
+ ; CHECK: .3:
+ ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+ ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ %1:_(s32) = COPY $vgpr0
+ %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1
+ $vgpr0_vgpr1 = COPY %2
+...
+
+---
name: extract_vector_elt_v8s64_vv
legalized: true
tracksRegLiveness: true
@@ -129,16 +225,27 @@ body: |
; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
- ; CHECK: [[DEF:%[0-9]+]]:vgpr(s64) = G_IMPLICIT_DEF
- ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
; CHECK: .1:
; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1
- ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s64) = G_PHI [[DEF]](s64), %bb.0, %2(s64), %bb.1
+ ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF4]], %bb.0, %20, %bb.1
+ ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1
+ ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1
+ ; CHECK: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1
+ ; CHECK: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1
; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
- ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[V_READFIRSTLANE_B32_]](s32)
+ ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
+ ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
+ ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
+ ; CHECK: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -146,7 +253,8 @@ body: |
; CHECK: successors: %bb.3(0x80000000)
; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
; CHECK: .3:
- ; CHECK: $vgpr0_vgpr1 = COPY [[EVEC]](s64)
+ ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+ ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
%0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
%1:_(s32) = COPY $vgpr16
%2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1