[llvm] c3bc805 - AMDGPU/SILoadStoreOptimizer: Refactor CombineInfo struct
Tom Stellard via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 13:43:29 PST 2019
Author: Tom Stellard
Date: 2019-12-17T13:43:10-08:00
New Revision: c3bc805f4f99d7a75d0d45ce97ade6ade554e505
URL: https://github.com/llvm/llvm-project/commit/c3bc805f4f99d7a75d0d45ce97ade6ade554e505
DIFF: https://github.com/llvm/llvm-project/commit/c3bc805f4f99d7a75d0d45ce97ade6ade554e505.diff
LOG: AMDGPU/SILoadStoreOptimizer: Refactor CombineInfo struct
Summary:
Modify CombineInfo to only store information about a single instruction.
This is a little easier to work with and removes a lot of duplicate
initialization code.
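For readers skimming the patch, a minimal, simplified sketch of the shape of the change follows (stand-in types and trimmed field lists, not the actual LLVM declarations): merge helpers now take two single-instruction CombineInfo records instead of one record that duplicated every field for the paired instruction.

// Illustrative sketch only -- simplified stand-ins, not the real LLVM types.
struct InstrIter {};  // stand-in for MachineBasicBlock::iterator

// Before this patch (conceptually): one record held both instructions,
// with every field duplicated as Foo0/Foo1.
struct CombineInfoOld {
  InstrIter I, Paired;
  unsigned Offset0 = 0, Offset1 = 0;
  unsigned Width0 = 0, Width1 = 0;
  // ... DMask0/1, Format0/1, GLC0/1, SLC0/1, DLC0/1 ...
};

// After: one record per instruction; initialization lives in a single
// setMI() and the duplicated setPaired() path goes away.
struct CombineInfo {
  InstrIter I;
  unsigned Offset = 0;
  unsigned Width = 0;
  // ... DMask, Format, GLC, SLC, DLC ...
};

// Merge helpers take the pair explicitly, e.g.
// mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired); here a helper
// mirrors the Width arithmetic used by widthsFit()/getNewOpcode():
unsigned mergedWidth(const CombineInfo &CI, const CombineInfo &Paired) {
  return CI.Width + Paired.Width;
}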
Reviewers: arsenm, nhaehnle
Reviewed By: arsenm, nhaehnle
Subscribers: merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D71045
Added:
Modified:
llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index d584b868bb5c..d2b1abc8a9fb 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -115,24 +115,16 @@ enum RegisterEnum {
class SILoadStoreOptimizer : public MachineFunctionPass {
struct CombineInfo {
MachineBasicBlock::iterator I;
- MachineBasicBlock::iterator Paired;
unsigned EltSize;
- unsigned Offset0;
- unsigned Offset1;
- unsigned Width0;
- unsigned Width1;
- unsigned Format0;
- unsigned Format1;
+ unsigned Offset;
+ unsigned Width;
+ unsigned Format;
unsigned BaseOff;
- unsigned DMask0;
- unsigned DMask1;
+ unsigned DMask;
InstClassEnum InstClass;
- bool GLC0;
- bool GLC1;
- bool SLC0;
- bool SLC1;
- bool DLC0;
- bool DLC1;
+ bool GLC;
+ bool SLC;
+ bool DLC;
bool UseST64;
SmallVector<MachineInstr *, 8> InstsToMove;
int AddrIdx[5];
@@ -188,7 +180,6 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
const GCNSubtarget &STM);
- void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
};
struct BaseRegisters {
@@ -216,28 +207,33 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool OptimizeAgain;
static bool dmasksCanBeCombined(const CombineInfo &CI,
- const SIInstrInfo &TII);
- static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI);
- static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
- static unsigned getNewOpcode(const CombineInfo &CI);
- static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
- const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
-
- bool findMatchingInst(CombineInfo &CI);
+ const SIInstrInfo &TII,
+ const CombineInfo &Paired);
+ static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI,
+ CombineInfo &Paired);
+ static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI,
+ const CombineInfo &Paired);
+ static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+ static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
+ const CombineInfo &Paired);
+ const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
+ const CombineInfo &Paired);
+
+ bool findMatchingInst(CombineInfo &CI, CombineInfo &Paired);
unsigned read2Opcode(unsigned EltSize) const;
unsigned read2ST64Opcode(unsigned EltSize) const;
- MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired);
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
- MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
- MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);
- MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
- MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
- MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
- MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI);
- MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired);
+ MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired);
+ MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired);
+ MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
+ MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
+ MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
+ MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
int32_t NewOffset) const;
@@ -498,25 +494,25 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
}
if (InstClass == MIMG) {
- DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
+ DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
} else {
int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
- Offset0 = I->getOperand(OffsetIdx).getImm();
+ Offset = I->getOperand(OffsetIdx).getImm();
}
if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
- Format0 = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
+ Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
- Width0 = getOpcodeWidth(*I, TII);
+ Width = getOpcodeWidth(*I, TII);
if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
- Offset0 &= 0xffff;
+ Offset &= 0xffff;
} else if (InstClass != MIMG) {
- GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
+ GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
if (InstClass != S_BUFFER_LOAD_IMM) {
- SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
+ SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
}
- DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
+ DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
}
unsigned AddrOpName[5] = {0};
@@ -555,34 +551,6 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
InstsToMove.clear();
}
-void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
- const SIInstrInfo &TII) {
- Paired = MI;
- assert(InstClass == getInstClass(Paired->getOpcode(), TII));
-
- if (InstClass == MIMG) {
- DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm();
- } else {
- int OffsetIdx =
- AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
- Offset1 = Paired->getOperand(OffsetIdx).getImm();
- }
-
- if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
- Format1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::format)->getImm();
-
- Width1 = getOpcodeWidth(*Paired, TII);
- if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
- Offset1 &= 0xffff;
- } else if (InstClass != MIMG) {
- GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
- if (InstClass != S_BUFFER_LOAD_IMM) {
- SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
- }
- DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
- }
-}
-
} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
@@ -688,7 +656,9 @@ static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
return MMO;
}
-bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) {
+bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
+ const SIInstrInfo &TII,
+ const CombineInfo &Paired) {
assert(CI.InstClass == MIMG);
// Ignore instructions with tfe/lwe set.
@@ -705,16 +675,16 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIIn
for (auto op : OperandsToMatch) {
int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
- if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx)
+ if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
return false;
if (Idx != -1 &&
- CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm())
+ CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
return false;
}
// Check DMask for overlaps.
- unsigned MaxMask = std::max(CI.DMask0, CI.DMask1);
- unsigned MinMask = std::min(CI.DMask0, CI.DMask1);
+ unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
+ unsigned MinMask = std::min(CI.DMask, Paired.DMask);
unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
if ((1u << AllowedBitsForMin) <= MinMask)
@@ -749,26 +719,27 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
}
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI,
+ CombineInfo &Paired) {
assert(CI.InstClass != MIMG);
// XXX - Would the same offset be OK? Is there any reason this would happen or
// be useful?
- if (CI.Offset0 == CI.Offset1)
+ if (CI.Offset == Paired.Offset)
return false;
// This won't be valid if the offset isn't aligned.
- if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
+ if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
return false;
if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
- llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format0, STI);
+ llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
if (!Info0)
return false;
const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
- llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format1, STI);
+ llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
if (!Info1)
return false;
@@ -781,54 +752,54 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
if (Info0->BitsPerComp != 32)
return false;
- if (getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, STI) == 0)
+ if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
return false;
}
- unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
- unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
+ unsigned EltOffset0 = CI.Offset / CI.EltSize;
+ unsigned EltOffset1 = Paired.Offset / CI.EltSize;
CI.UseST64 = false;
CI.BaseOff = 0;
// Handle SMEM and VMEM instructions.
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
- return (EltOffset0 + CI.Width0 == EltOffset1 ||
- EltOffset1 + CI.Width1 == EltOffset0) &&
- CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
- (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
+ return (EltOffset0 + CI.Width == EltOffset1 ||
+ EltOffset1 + Paired.Width == EltOffset0) &&
+ CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
+ (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
}
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
- CI.Offset0 = EltOffset0 / 64;
- CI.Offset1 = EltOffset1 / 64;
+ CI.Offset = EltOffset0 / 64;
+ Paired.Offset = EltOffset1 / 64;
CI.UseST64 = true;
return true;
}
// Check if the new offsets fit in the reduced 8-bit range.
if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
- CI.Offset0 = EltOffset0;
- CI.Offset1 = EltOffset1;
+ CI.Offset = EltOffset0;
+ Paired.Offset = EltOffset1;
return true;
}
// Try to shift base address to decrease offsets.
unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
- CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
+ CI.BaseOff = std::min(CI.Offset, Paired.Offset);
if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
- CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
- CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+ CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
+ Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
CI.UseST64 = true;
return true;
}
if (isUInt<8>(OffsetDiff)) {
- CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
- CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
+ CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
+ Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
return true;
}
@@ -836,8 +807,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
}
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
- const CombineInfo &CI) {
- const unsigned Width = (CI.Width0 + CI.Width1);
+ const CombineInfo &CI,
+ const CombineInfo &Paired) {
+ const unsigned Width = (CI.Width + Paired.Width);
switch (CI.InstClass) {
default:
return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
@@ -852,7 +824,8 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
}
}
-bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
+bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
+ CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
@@ -934,14 +907,14 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
bool Match = CI.hasSameBaseAddress(*MBBI);
if (Match) {
- CI.setPaired(MBBI, *TII);
+ Paired.setMI(MBBI, *TII, *STM);
// Check both offsets (or masks for MIMG) can be combined and fit in the
// reduced range.
bool canBeCombined =
CI.InstClass == MIMG
- ? dmasksCanBeCombined(CI, *TII)
- : widthsFit(*STM, CI) && offsetsCanBeCombined(CI, *STI);
+ ? dmasksCanBeCombined(CI, *TII, Paired)
+ : widthsFit(*STM, CI, Paired) && offsetsCanBeCombined(CI, *STI, Paired);
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
@@ -977,7 +950,7 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -985,10 +958,10 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
- const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
+ const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
- unsigned NewOffset0 = CI.Offset0;
- unsigned NewOffset1 = CI.Offset1;
+ unsigned NewOffset0 = CI.Offset;
+ unsigned NewOffset1 = Paired.Offset;
unsigned Opc =
CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
@@ -1017,13 +990,13 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+ TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
.addReg(ImmReg)
.addReg(AddrReg->getReg(), 0, BaseSubReg)
.addImm(0); // clamp bit
@@ -1031,29 +1004,29 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
}
MachineInstrBuilder Read2 =
- BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
+ BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
.addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
.addImm(0) // gds
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+ .cloneMergedMemRefs({&*CI.I, &*Paired.I});
(void)Read2;
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
moveInstsAfter(Copy1, CI.InstsToMove);
CI.I->eraseFromParent();
- CI.Paired->eraseFromParent();
+ Paired.I->eraseFromParent();
LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Read2;
@@ -1076,7 +1049,7 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -1086,10 +1059,10 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
const MachineOperand *Data0 =
TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
const MachineOperand *Data1 =
- TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
+ TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
- unsigned NewOffset0 = CI.Offset0;
- unsigned NewOffset1 = CI.Offset1;
+ unsigned NewOffset0 = CI.Offset;
+ unsigned NewOffset1 = Paired.Offset;
unsigned Opc =
CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
@@ -1110,13 +1083,13 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+ BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
- TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+ TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
.addReg(ImmReg)
.addReg(AddrReg->getReg(), 0, BaseSubReg)
.addImm(0); // clamp bit
@@ -1124,38 +1097,38 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
}
MachineInstrBuilder Write2 =
- BuildMI(*MBB, CI.Paired, DL, Write2Desc)
+ BuildMI(*MBB, Paired.I, DL, Write2Desc)
.addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
.add(*Data0) // data0
.add(*Data1) // data1
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
.addImm(0) // gds
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+ .cloneMergedMemRefs({&*CI.I, &*Paired.I});
moveInstsAfter(Write2, CI.InstsToMove);
CI.I->eraseFromParent();
- CI.Paired->eraseFromParent();
+ Paired.I->eraseFromParent();
LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
return Write2;
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- const unsigned Opcode = getNewOpcode(CI);
+ const unsigned Opcode = getNewOpcode(CI, Paired);
- const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);
- unsigned MergedDMask = CI.DMask0 | CI.DMask1;
+ unsigned MergedDMask = CI.DMask | Paired.DMask;
unsigned DMaskIdx =
AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
- auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
if (I == DMaskIdx)
MIB.addImm(MergedDMask);
@@ -1166,100 +1139,99 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
// will return true if this is the case.
- assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+ assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+ const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
- std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
- const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
- const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+ unsigned SubRegIdx0, SubRegIdx1;
+ std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
- const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+ const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
moveInstsAfter(Copy1, CI.InstsToMove);
CI.I->eraseFromParent();
- CI.Paired->eraseFromParent();
+ Paired.I->eraseFromParent();
return New;
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- const unsigned Opcode = getNewOpcode(CI);
+ const unsigned Opcode = getNewOpcode(CI, Paired);
- const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);
- unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+ unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
// will return true if this is the case.
- assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+ assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+ const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
MachineInstr *New =
- BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+ BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
.addImm(MergedOffset) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.DLC0) // dlc
+ .addImm(CI.GLC) // glc
+ .addImm(CI.DLC) // dlc
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
- std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
- const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
+ const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
- BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
moveInstsAfter(Copy1, CI.InstsToMove);
CI.I->eraseFromParent();
- CI.Paired->eraseFromParent();
+ Paired.I->eraseFromParent();
return New;
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- const unsigned Opcode = getNewOpcode(CI);
+ const unsigned Opcode = getNewOpcode(CI, Paired);
- const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
// Copy to the new source register.
Register DestReg = MRI->createVirtualRegister(SuperRC);
- unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+ unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
- auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
const unsigned Regs = getRegs(Opcode, *TII);
@@ -1269,59 +1241,59 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
// will return true if this is the case.
- assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+ assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+ const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(MergedOffset) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
+ .addImm(CI.GLC) // glc
+ .addImm(CI.SLC) // slc
.addImm(0) // tfe
- .addImm(CI.DLC0) // dlc
+ .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
- std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
- const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+ const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
moveInstsAfter(Copy1, CI.InstsToMove);
CI.I->eraseFromParent();
- CI.Paired->eraseFromParent();
+ Paired.I->eraseFromParent();
return New;
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- const unsigned Opcode = getNewOpcode(CI);
+ const unsigned Opcode = getNewOpcode(CI, Paired);
- const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
// Copy to the new source register.
Register DestReg = MRI->createVirtualRegister(SuperRC);
- unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+ unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
- auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
const unsigned Regs = getRegs(Opcode, *TII);
@@ -1329,77 +1301,77 @@ SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI) {
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
unsigned JoinedFormat =
- getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, *STI);
+ getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI);
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
// will return true if this is the case.
- assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+ assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+ const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(MergedOffset) // offset
.addImm(JoinedFormat) // format
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
+ .addImm(CI.GLC) // glc
+ .addImm(CI.SLC) // slc
.addImm(0) // tfe
- .addImm(CI.DLC0) // dlc
+ .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
- std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
- const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+ const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
moveInstsAfter(Copy1, CI.InstsToMove);
CI.I->eraseFromParent();
- CI.Paired->eraseFromParent();
+ Paired.I->eraseFromParent();
return New;
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- const unsigned Opcode = getNewOpcode(CI);
+ const unsigned Opcode = getNewOpcode(CI, Paired);
- std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the new source register.
- const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register SrcReg = MRI->createVirtualRegister(SuperRC);
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
- const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+ const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
.add(*Src0)
.addImm(SubRegIdx0)
.add(*Src1)
.addImm(SubRegIdx1);
- auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
+ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
const unsigned Regs = getRegs(Opcode, *TII);
@@ -1408,25 +1380,25 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI) {
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
unsigned JoinedFormat =
- getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, *STI);
+ getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI);
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
// will return true if this is the case.
- assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+ assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+ const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
- .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+ .addImm(std::min(CI.Offset, Paired.Offset)) // offset
.addImm(JoinedFormat) // format
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
+ .addImm(CI.GLC) // glc
+ .addImm(CI.SLC) // slc
.addImm(0) // tfe
- .addImm(CI.DLC0) // dlc
+ .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1434,12 +1406,13 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI) {
moveInstsAfter(MIB, CI.InstsToMove);
CI.I->eraseFromParent();
- CI.Paired->eraseFromParent();
+ Paired.I->eraseFromParent();
return New;
}
-unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
- const unsigned Width = CI.Width0 + CI.Width1;
+unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
+ const CombineInfo &Paired) {
+ const unsigned Width = CI.Width + Paired.Width;
switch (CI.InstClass) {
default:
@@ -1464,24 +1437,24 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
}
case MIMG:
- assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width));
+ assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
}
}
std::pair<unsigned, unsigned>
-SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
- if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
+ if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
return std::make_pair(0, 0);
bool ReverseOrder;
if (CI.InstClass == MIMG) {
- assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) &&
+ assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
"No overlaps");
- ReverseOrder = CI.DMask0 > CI.DMask1;
+ ReverseOrder = CI.DMask > Paired.DMask;
} else
- ReverseOrder = CI.Offset0 > CI.Offset1;
+ ReverseOrder = CI.Offset > Paired.Offset;
static const unsigned Idxs[4][4] = {
{AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
@@ -1492,24 +1465,25 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
unsigned Idx0;
unsigned Idx1;
- assert(CI.Width0 >= 1 && CI.Width0 <= 3);
- assert(CI.Width1 >= 1 && CI.Width1 <= 3);
+ assert(CI.Width >= 1 && CI.Width <= 3);
+ assert(Paired.Width >= 1 && Paired.Width <= 3);
if (ReverseOrder) {
- Idx1 = Idxs[0][CI.Width1 - 1];
- Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
+ Idx1 = Idxs[0][Paired.Width - 1];
+ Idx0 = Idxs[Paired.Width][CI.Width - 1];
} else {
- Idx0 = Idxs[0][CI.Width0 - 1];
- Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
+ Idx0 = Idxs[0][CI.Width - 1];
+ Idx1 = Idxs[CI.Width][Paired.Width - 1];
}
return std::make_pair(Idx0, Idx1);
}
const TargetRegisterClass *
-SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
+SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
+ const CombineInfo &Paired) {
if (CI.InstClass == S_BUFFER_LOAD_IMM) {
- switch (CI.Width0 + CI.Width1) {
+ switch (CI.Width + Paired.Width) {
default:
return nullptr;
case 2:
@@ -1522,7 +1496,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
return &AMDGPU::SReg_512RegClass;
}
} else {
- switch (CI.Width0 + CI.Width1) {
+ switch (CI.Width + Paired.Width) {
default:
return nullptr;
case 2:
@@ -1536,30 +1510,30 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- const unsigned Opcode = getNewOpcode(CI);
+ const unsigned Opcode = getNewOpcode(CI, Paired);
- std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the new source register.
- const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register SrcReg = MRI->createVirtualRegister(SuperRC);
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
- const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+ const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
- BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
.add(*Src0)
.addImm(SubRegIdx0)
.add(*Src1)
.addImm(SubRegIdx1);
- auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
+ auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
const unsigned Regs = getRegs(Opcode, *TII);
@@ -1571,26 +1545,26 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
// will return true if this is the case.
- assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+ assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
- const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+ const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
MachineInstr *New =
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
- .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
+ .addImm(std::min(CI.Offset, Paired.Offset)) // offset
+ .addImm(CI.GLC) // glc
+ .addImm(CI.SLC) // slc
.addImm(0) // tfe
- .addImm(CI.DLC0) // dlc
+ .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
moveInstsAfter(MIB, CI.InstsToMove);
CI.I->eraseFromParent();
- CI.Paired->eraseFromParent();
+ Paired.I->eraseFromParent();
return New;
}
@@ -2005,64 +1979,65 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
bool Modified = false;
for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
CombineInfo &CI = *I;
+ CombineInfo Paired;
if (CI.InstClass == UNKNOWN)
continue;
- if (!findMatchingInst(CI))
+ if (!findMatchingInst(CI, Paired))
goto done;
Modified = true;
- removeCombinedInst(MergeList, *CI.Paired);
+ removeCombinedInst(MergeList, *Paired.I);
switch (CI.InstClass) {
default:
llvm_unreachable("unknown InstClass");
break;
case DS_READ: {
- MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
+ MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI, Paired);
CI.setMI(NewMI, *TII, *STM);
break;
}
case DS_WRITE: {
- MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
+ MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI, Paired);
CI.setMI(NewMI, *TII, *STM);
break;
}
case S_BUFFER_LOAD_IMM: {
- MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
+ MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
break;
}
case BUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
+ MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI, Paired);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case BUFFER_STORE: {
- MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
+ MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI, Paired);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case MIMG: {
- MachineBasicBlock::iterator NewMI = mergeImagePair(CI);
+ MachineBasicBlock::iterator NewMI = mergeImagePair(CI, Paired);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI);
+ MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI, Paired);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_STORE: {
- MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI);
+ MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI, Paired);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
}