[llvm] r348937 - [AMDGPU] Extend the SI Load/Store optimizer to combine more things.
Neil Henning via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 12 08:15:21 PST 2018
Author: sheredom
Date: Wed Dec 12 08:15:21 2018
New Revision: 348937
URL: http://llvm.org/viewvc/llvm-project?rev=348937&view=rev
Log:
[AMDGPU] Extend the SI Load/Store optimizer to combine more things.
I've extended the load/store optimizer so that it can produce dwordx3
loads and stores. This change allows many more loads and stores to be
combined, and results in significantly better code for our hardware.
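To illustrate the effect, here is a minimal sketch (not part of this patch, modelled on the merge_global_store_3_constants_i32 test updated below): three adjacent dword stores that the optimizer can now merge into a single buffer_store_dwordx3 instead of a dwordx2 plus a dword.

; Sketch only; the function name is illustrative, not from the patch.
; Expected SI codegen after this change (per the updated merge-stores.ll checks):
;   buffer_store_dwordx3
define amdgpu_kernel void @store_3_adjacent_i32(i32 addrspace(1)* %out) {
  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 1, i32 addrspace(1)* %out, align 4
  store i32 2, i32 addrspace(1)* %p1, align 4
  store i32 3, i32 addrspace(1)* %p2, align 4
  ret void
}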
Differential Revision: https://reviews.llvm.org/D54042
Added:
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll
llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll
Modified: llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td Wed Dec 12 08:15:21 2018
@@ -286,6 +286,12 @@ multiclass MTBUF_Pseudo_Stores<string op
// MUBUF classes
//===----------------------------------------------------------------------===//
+class MUBUFGetBaseOpcode<string Op> {
+ string ret = !subst("DWORDX2", "DWORD",
+ !subst("DWORDX3", "DWORD",
+ !subst("DWORDX4", "DWORD", Op)));
+}
+
class MUBUF_Pseudo <string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
InstSI<outs, ins, "", pattern>,
@@ -299,6 +305,9 @@ class MUBUF_Pseudo <string opName, dag o
string Mnemonic = opName;
string AsmOperands = asmOps;
+ Instruction Opcode = !cast<Instruction>(NAME);
+ Instruction BaseOpcode = !cast<Instruction>(MUBUFGetBaseOpcode<NAME>.ret);
+
let VM_CNT = 1;
let EXP_CNT = 1;
let MUBUF = 1;
@@ -321,6 +330,7 @@ class MUBUF_Pseudo <string opName, dag o
bits<1> has_offset = 1;
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
+ bits<4> dwords = 0;
}
class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
@@ -394,6 +404,16 @@ class getMUBUFInsDA<list<RegisterClass>
);
}
+class getMUBUFDwords<RegisterClass regClass> {
+ string regClassAsInt = !cast<string>(regClass);
+ int ret =
+ !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1,
+ !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2,
+ !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3,
+ !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4,
+ 0))));
+}
+
class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
dag ret =
!if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret,
@@ -454,6 +474,7 @@ class MUBUF_Load_Pseudo <string opName,
let Uses = !if(isLds, [EXEC, M0], [EXEC]);
let has_tfe = !if(isLds, 0, 1);
let lds = isLds;
+ let dwords = getMUBUFDwords<vdataClass>.ret;
}
// FIXME: tfe can't be an operand because it requires a separate
@@ -517,6 +538,7 @@ class MUBUF_Store_Pseudo <string opName,
let mayLoad = 0;
let mayStore = 1;
let maybeAtomic = 1;
+ let dwords = getMUBUFDwords<vdataClass>.ret;
}
multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
@@ -2068,3 +2090,22 @@ let SubtargetPredicate = HasPackedD16VMe
defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>;
defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>;
} // End HasUnpackedD16VMem.
+
+def MUBUFInfoTable : GenericTable {
+ let FilterClass = "MUBUF_Pseudo";
+ let CppTypeName = "MUBUFInfo";
+ let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getMUBUFOpcodeHelper";
+}
+
+def getMUBUFInfoFromOpcode : SearchIndex {
+ let Table = MUBUFInfoTable;
+ let Key = ["Opcode"];
+}
+
+def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex {
+ let Table = MUBUFInfoTable;
+ let Key = ["BaseOpcode", "dwords"];
+}
Modified: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp Wed Dec 12 08:15:21 2018
@@ -43,9 +43,9 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
@@ -74,23 +74,38 @@ using namespace llvm;
#define DEBUG_TYPE "si-load-store-opt"
namespace {
+enum InstClassEnum {
+ UNKNOWN,
+ DS_READ,
+ DS_WRITE,
+ S_BUFFER_LOAD_IMM,
+ BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
+ BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
+ BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+ BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
+ BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
+ BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
+ BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
+};
-class SILoadStoreOptimizer : public MachineFunctionPass {
- enum InstClassEnum {
- DS_READ_WRITE,
- S_BUFFER_LOAD_IMM,
- BUFFER_LOAD_OFFEN,
- BUFFER_LOAD_OFFSET,
- BUFFER_STORE_OFFEN,
- BUFFER_STORE_OFFSET,
- };
+enum RegisterEnum {
+ SBASE = 0x1,
+ SRSRC = 0x2,
+ SOFFSET = 0x4,
+ VADDR = 0x8,
+ ADDR = 0x10,
+};
+class SILoadStoreOptimizer : public MachineFunctionPass {
struct CombineInfo {
MachineBasicBlock::iterator I;
MachineBasicBlock::iterator Paired;
unsigned EltSize;
unsigned Offset0;
unsigned Offset1;
+ unsigned Width0;
+ unsigned Width1;
unsigned BaseOff;
InstClassEnum InstClass;
bool GLC0;
@@ -98,9 +113,8 @@ class SILoadStoreOptimizer : public Mach
bool SLC0;
bool SLC1;
bool UseST64;
- bool IsX2;
- SmallVector<MachineInstr*, 8> InstsToMove;
- };
+ SmallVector<MachineInstr *, 8> InstsToMove;
+ };
private:
const GCNSubtarget *STM = nullptr;
@@ -108,9 +122,16 @@ private:
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
AliasAnalysis *AA = nullptr;
- unsigned CreatedX2;
+ bool OptimizeAgain;
static bool offsetsCanBeCombined(CombineInfo &CI);
+ static bool widthsFit(const CombineInfo &CI);
+ static unsigned getNewOpcode(const CombineInfo &CI);
+ static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
+ const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
+ unsigned getOpcodeWidth(const MachineInstr &MI);
+ InstClassEnum getInstClass(unsigned Opc);
+ unsigned getRegs(unsigned Opc);
bool findMatchingInst(CombineInfo &CI);
@@ -123,8 +144,6 @@ private:
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
- unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
- bool &IsOffen) const;
MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
public:
@@ -153,8 +172,8 @@ public:
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
- "SI Load Store Optimizer", false, false)
+INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
+ false, false)
char SILoadStoreOptimizer::ID = 0;
@@ -165,7 +184,7 @@ FunctionPass *llvm::createSILoadStoreOpt
}
static void moveInstsAfter(MachineBasicBlock::iterator I,
- ArrayRef<MachineInstr*> InstsToMove) {
+ ArrayRef<MachineInstr *> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
++I;
for (MachineInstr *MI : InstsToMove) {
@@ -191,21 +210,19 @@ static void addDefsUsesToList(const Mach
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
MachineBasicBlock::iterator B,
const SIInstrInfo *TII,
- AliasAnalysis * AA) {
+ AliasAnalysis *AA) {
// RAW or WAR - cannot reorder
// WAW - cannot reorder
// RAR - safe to reorder
return !(A->mayStore() || B->mayStore()) ||
- TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+ TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}
// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
-static bool
-addToListsIfDependent(MachineInstr &MI,
- DenseSet<unsigned> &RegDefs,
- DenseSet<unsigned> &PhysRegUses,
- SmallVectorImpl<MachineInstr*> &Insts) {
+static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
+ DenseSet<unsigned> &PhysRegUses,
+ SmallVectorImpl<MachineInstr *> &Insts) {
for (MachineOperand &Use : MI.operands()) {
// If one of the defs is read, then there is a use of Def between I and the
// instruction that I will potentially be merged with. We will need to move
@@ -228,18 +245,16 @@ addToListsIfDependent(MachineInstr &MI,
return false;
}
-static bool
-canMoveInstsAcrossMemOp(MachineInstr &MemOp,
- ArrayRef<MachineInstr*> InstsToMove,
- const SIInstrInfo *TII,
- AliasAnalysis *AA) {
+static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+ ArrayRef<MachineInstr *> InstsToMove,
+ const SIInstrInfo *TII, AliasAnalysis *AA) {
assert(MemOp.mayLoadOrStore());
for (MachineInstr *InstToMove : InstsToMove) {
if (!InstToMove->mayLoadOrStore())
continue;
if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
- return false;
+ return false;
}
return true;
}
@@ -260,10 +275,9 @@ bool SILoadStoreOptimizer::offsetsCanBeC
CI.BaseOff = 0;
// Handle SMEM and VMEM instructions.
- if (CI.InstClass != DS_READ_WRITE) {
- unsigned Diff = CI.IsX2 ? 2 : 1;
- return (EltOffset0 + Diff == EltOffset1 ||
- EltOffset1 + Diff == EltOffset0) &&
+ if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
+ return (EltOffset0 + CI.Width0 == EltOffset1 ||
+ EltOffset1 + CI.Width1 == EltOffset0) &&
CI.GLC0 == CI.GLC1 &&
(CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
}
@@ -305,42 +319,175 @@ bool SILoadStoreOptimizer::offsetsCanBeC
return false;
}
+bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
+ const unsigned Width = (CI.Width0 + CI.Width1);
+ switch (CI.InstClass) {
+ default:
+ return Width <= 4;
+ case S_BUFFER_LOAD_IMM:
+ switch (Width) {
+ default:
+ return false;
+ case 2:
+ case 4:
+ return true;
+ }
+ }
+}
+
+unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
+ const unsigned Opc = MI.getOpcode();
+
+ if (TII->isMUBUF(MI)) {
+ return AMDGPU::getMUBUFDwords(Opc);
+ }
+
+ switch (Opc) {
+ default:
+ return 0;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ return 1;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ return 2;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return 4;
+ }
+}
+
+InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
+ if (TII->isMUBUF(Opc)) {
+ const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
+
+ // If we couldn't identify the opcode, bail out.
+ if (baseOpcode == -1) {
+ return UNKNOWN;
+ }
+
+ switch (baseOpcode) {
+ default:
+ return UNKNOWN;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+ return BUFFER_LOAD_OFFEN;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
+ return BUFFER_LOAD_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ return BUFFER_STORE_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ return BUFFER_STORE_OFFSET;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+ return BUFFER_LOAD_OFFEN_exact;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+ return BUFFER_LOAD_OFFSET_exact;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+ return BUFFER_STORE_OFFEN_exact;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ return BUFFER_STORE_OFFSET_exact;
+ }
+ }
+
+ switch (Opc) {
+ default:
+ return UNKNOWN;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return S_BUFFER_LOAD_IMM;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64_gfx9:
+ return DS_READ;
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return DS_WRITE;
+ }
+}
+
+unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
+ if (TII->isMUBUF(Opc)) {
+ unsigned result = 0;
+
+ if (AMDGPU::getMUBUFHasVAddr(Opc)) {
+ result |= VADDR;
+ }
+
+ if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
+ result |= SRSRC;
+ }
+
+ if (AMDGPU::getMUBUFHasSoffset(Opc)) {
+ result |= SOFFSET;
+ }
+
+ return result;
+ }
+
+ switch (Opc) {
+ default:
+ return 0;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return SBASE;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64_gfx9:
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return ADDR;
+ }
+}
+
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
- unsigned AddrOpName[3] = {0};
- int AddrIdx[3];
- const MachineOperand *AddrReg[3];
+ const unsigned Opc = CI.I->getOpcode();
+ const InstClassEnum InstClass = getInstClass(Opc);
+
+ if (InstClass == UNKNOWN) {
+ return false;
+ }
+
+ const unsigned Regs = getRegs(Opc);
+
+ unsigned AddrOpName[5] = {0};
+ int AddrIdx[5];
+ const MachineOperand *AddrReg[5];
unsigned NumAddresses = 0;
- switch (CI.InstClass) {
- case DS_READ_WRITE:
+ if (Regs & ADDR) {
AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
- break;
- case S_BUFFER_LOAD_IMM:
+ }
+
+ if (Regs & SBASE) {
AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
- break;
- case BUFFER_LOAD_OFFEN:
- case BUFFER_STORE_OFFEN:
- AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
- AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
- AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
- break;
- case BUFFER_LOAD_OFFSET:
- case BUFFER_STORE_OFFSET:
+ }
+
+ if (Regs & SRSRC) {
AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ }
+
+ if (Regs & SOFFSET) {
AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
- break;
+ }
+
+ if (Regs & VADDR) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
}
for (unsigned i = 0; i < NumAddresses; i++) {
AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
- // We only ever merge operations with the same base address register, so don't
- // bother scanning forward if there are no other uses.
+ // We only ever merge operations with the same base address register, so
+ // don't bother scanning forward if there are no other uses.
if (AddrReg[i]->isReg() &&
(TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
@@ -353,8 +500,11 @@ bool SILoadStoreOptimizer::findMatchingI
DenseSet<unsigned> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
- for ( ; MBBI != E; ++MBBI) {
- if (MBBI->getOpcode() != CI.I->getOpcode()) {
+ for (; MBBI != E; ++MBBI) {
+ const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
+
+ if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
+ (IsDS && (MBBI->getOpcode() != Opc))) {
// This is not a matching DS instruction, but we can keep looking as
// long as one of these conditions are met:
// 1. It is safe to move I down past MBBI.
@@ -368,8 +518,8 @@ bool SILoadStoreOptimizer::findMatchingI
}
if (MBBI->mayLoadOrStore() &&
- (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+ (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
@@ -413,8 +563,8 @@ bool SILoadStoreOptimizer::findMatchingI
continue;
}
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
+ // Check same base pointer. Be careful of subregisters, which can occur
+ // with vectors of pointers.
if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
Match = false;
@@ -423,13 +573,15 @@ bool SILoadStoreOptimizer::findMatchingI
}
if (Match) {
- int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
- AMDGPU::OpName::offset);
+ int OffsetIdx =
+ AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+ CI.Width0 = getOpcodeWidth(*CI.I);
CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
+ CI.Width1 = getOpcodeWidth(*MBBI);
CI.Paired = MBBI;
- if (CI.InstClass == DS_READ_WRITE) {
+ if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
CI.Offset0 &= 0xffff;
CI.Offset1 &= 0xffff;
} else {
@@ -445,7 +597,7 @@ bool SILoadStoreOptimizer::findMatchingI
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
- if (offsetsCanBeCombined(CI))
+ if (widthsFit(CI) && offsetsCanBeCombined(CI))
if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
return true;
}
@@ -472,12 +624,12 @@ unsigned SILoadStoreOptimizer::read2ST64
if (STM->ldsRequiresM0Init())
return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
- return (EltSize == 4) ?
- AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
+ return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
+ : AMDGPU::DS_READ2ST64_B64_gfx9;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
- CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -489,8 +641,8 @@ MachineBasicBlock::iterator SILoadStore
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = CI.UseST64 ?
- read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
+ unsigned Opc =
+ CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
@@ -502,13 +654,12 @@ MachineBasicBlock::iterator SILoadStore
}
assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
- (NewOffset0 != NewOffset1) &&
- "Computed offset doesn't fit");
+ (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
const MCInstrDesc &Read2Desc = TII->get(Opc);
- const TargetRegisterClass *SuperRC
- = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ const TargetRegisterClass *SuperRC =
+ (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
DebugLoc DL = CI.I->getDebugLoc();
@@ -519,23 +670,24 @@ MachineBasicBlock::iterator SILoadStore
if (CI.BaseOff) {
unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
- .addImm(CI.BaseOff);
+ .addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
- .addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ .addReg(ImmReg)
+ .addReg(AddrReg->getReg(), 0, BaseSubReg);
BaseSubReg = 0;
}
- MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
- .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
- .addImm(NewOffset0) // offset0
- .addImm(NewOffset1) // offset1
- .addImm(0) // gds
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+ MachineInstrBuilder Read2 =
+ BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
+ .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
(void)Read2;
@@ -562,32 +714,36 @@ MachineBasicBlock::iterator SILoadStore
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
if (STM->ldsRequiresM0Init())
return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
- return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
+ : AMDGPU::DS_WRITE2_B64_gfx9;
}
unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
if (STM->ldsRequiresM0Init())
- return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
+ : AMDGPU::DS_WRITE2ST64_B64;
- return (EltSize == 4) ?
- AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
+ : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
- CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
- const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
- const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
- const MachineOperand *Data1
- = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
+ const MachineOperand *AddrReg =
+ TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+ const MachineOperand *Data0 =
+ TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+ const MachineOperand *Data1 =
+ TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = CI.UseST64 ?
- write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+ unsigned Opc =
+ CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -596,8 +752,7 @@ MachineBasicBlock::iterator SILoadStoreO
}
assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
- (NewOffset0 != NewOffset1) &&
- "Computed offset doesn't fit");
+ (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();
@@ -608,25 +763,26 @@ MachineBasicBlock::iterator SILoadStoreO
if (CI.BaseOff) {
unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
- .addImm(CI.BaseOff);
+ .addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
- .addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ .addReg(ImmReg)
+ .addReg(AddrReg->getReg(), 0, BaseSubReg);
BaseSubReg = 0;
}
- MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
- .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
- .add(*Data0) // data0
- .add(*Data1) // data1
- .addImm(NewOffset0) // offset0
- .addImm(NewOffset1) // offset1
- .addImm(0) // gds
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+ MachineInstrBuilder Write2 =
+ BuildMI(*MBB, CI.Paired, DL, Write2Desc)
+ .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
+ .add(*Data0) // data0
+ .add(*Data1) // data1
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
moveInstsAfter(Write2, CI.InstsToMove);
@@ -638,15 +794,14 @@ MachineBasicBlock::iterator SILoadStoreO
return Next;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
- CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
- AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ const unsigned Opcode = getNewOpcode(CI);
+
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
- const TargetRegisterClass *SuperRC =
- CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
@@ -656,12 +811,9 @@ MachineBasicBlock::iterator SILoadStoreO
.addImm(CI.GLC0) // glc
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
- unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
- unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
- // Handle descending offsets
- if (CI.Offset0 > CI.Offset1)
- std::swap(SubRegIdx0, SubRegIdx1);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -683,29 +835,25 @@ MachineBasicBlock::iterator SILoadStoreO
return Next;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
- CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- unsigned Opcode;
- if (CI.InstClass == BUFFER_LOAD_OFFEN) {
- Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
- AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
- } else {
- Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
- AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
- }
+ const unsigned Opcode = getNewOpcode(CI);
- const TargetRegisterClass *SuperRC =
- CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+ // Copy to the new source register.
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
- if (CI.InstClass == BUFFER_LOAD_OFFEN)
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+ const unsigned Regs = getRegs(Opcode);
+
+ if (Regs & VADDR)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -715,12 +863,9 @@ MachineBasicBlock::iterator SILoadStoreO
.addImm(0) // tfe
.cloneMergedMemRefs({&*CI.I, &*CI.Paired});
- unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
- unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
- // Handle descending offsets
- if (CI.Offset0 > CI.Offset1)
- std::swap(SubRegIdx0, SubRegIdx1);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -742,57 +887,137 @@ MachineBasicBlock::iterator SILoadStoreO
return Next;
}
-unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
- const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
- IsX2 = false;
- IsOffen = false;
-
- switch (I.getOpcode()) {
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
- IsOffen = true;
- return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
- IsOffen = true;
- return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
- case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
- IsX2 = true;
- IsOffen = true;
- return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
- case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
- IsX2 = true;
- IsOffen = true;
- return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
- return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
- return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
- case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
- IsX2 = true;
- return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
- case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
- IsX2 = true;
- return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
+unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
+ const unsigned Width = CI.Width0 + CI.Width1;
+
+ switch (CI.InstClass) {
+ default:
+ return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
+ case UNKNOWN:
+ llvm_unreachable("Unknown instruction class");
+ case S_BUFFER_LOAD_IMM:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ case 4:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ }
}
- return 0;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
- CombineInfo &CI) {
+std::pair<unsigned, unsigned>
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
+ if (CI.Offset0 > CI.Offset1) {
+ switch (CI.Width0) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
+ case 2:
+ return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
+ case 3:
+ return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
+ }
+ case 2:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
+ case 2:
+ return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
+ }
+ case 3:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
+ }
+ }
+ } else {
+ switch (CI.Width0) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
+ case 2:
+ return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
+ case 3:
+ return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
+ }
+ case 2:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
+ case 2:
+ return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
+ }
+ case 3:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
+ }
+ }
+ }
+}
+
+const TargetRegisterClass *
+SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
+ if (CI.InstClass == S_BUFFER_LOAD_IMM) {
+ switch (CI.Width0 + CI.Width1) {
+ default:
+ return nullptr;
+ case 2:
+ return &AMDGPU::SReg_64_XEXECRegClass;
+ case 4:
+ return &AMDGPU::SReg_128RegClass;
+ case 8:
+ return &AMDGPU::SReg_256RegClass;
+ case 16:
+ return &AMDGPU::SReg_512RegClass;
+ }
+ } else {
+ switch (CI.Width0 + CI.Width1) {
+ default:
+ return nullptr;
+ case 2:
+ return &AMDGPU::VReg_64RegClass;
+ case 3:
+ return &AMDGPU::VReg_96RegClass;
+ case 4:
+ return &AMDGPU::VReg_128RegClass;
+ }
+ }
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- bool Unused1, Unused2;
- unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
- unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
- unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+ const unsigned Opcode = getNewOpcode(CI);
- // Handle descending offsets
- if (CI.Offset0 > CI.Offset1)
- std::swap(SubRegIdx0, SubRegIdx1);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the new source register.
- const TargetRegisterClass *SuperRC =
- CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
@@ -805,10 +1030,12 @@ MachineBasicBlock::iterator SILoadStoreO
.addImm(SubRegIdx1);
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
- .addReg(SrcReg, RegState::Kill);
+ .addReg(SrcReg, RegState::Kill);
- if (CI.InstClass == BUFFER_STORE_OFFEN)
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+ const unsigned Regs = getRegs(Opcode);
+
+ if (Regs & VADDR)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -841,90 +1068,69 @@ bool SILoadStoreOptimizer::optimizeBlock
continue;
}
+ const unsigned Opc = MI.getOpcode();
+
CombineInfo CI;
CI.I = I;
- unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
- Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+ CI.InstClass = getInstClass(Opc);
- CI.InstClass = DS_READ_WRITE;
+ switch (CI.InstClass) {
+ default:
+ break;
+ case DS_READ:
CI.EltSize =
- (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
-
+ (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
+ : 4;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeRead2Pair(CI);
} else {
++I;
}
-
continue;
- } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
- Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
- Opc == AMDGPU::DS_WRITE_B64_gfx9) {
- CI.InstClass = DS_READ_WRITE;
- CI.EltSize
- = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
-
+ case DS_WRITE:
+ CI.EltSize =
+ (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
+ : 4;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeWrite2Pair(CI);
} else {
++I;
}
-
continue;
- }
- if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
- Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
- // EltSize is in units of the offset encoding.
- CI.InstClass = S_BUFFER_LOAD_IMM;
+ case S_BUFFER_LOAD_IMM:
CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
- CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeSBufferLoadImmPair(CI);
- if (!CI.IsX2)
- CreatedX2++;
+ OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
} else {
++I;
}
continue;
- }
- if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
- Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
- Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
- Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
- if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
- Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
- CI.InstClass = BUFFER_LOAD_OFFEN;
- else
- CI.InstClass = BUFFER_LOAD_OFFSET;
-
+ case BUFFER_LOAD_OFFEN:
+ case BUFFER_LOAD_OFFSET:
+ case BUFFER_LOAD_OFFEN_exact:
+ case BUFFER_LOAD_OFFSET_exact:
CI.EltSize = 4;
- CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
- Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeBufferLoadPair(CI);
- if (!CI.IsX2)
- CreatedX2++;
+ OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
} else {
++I;
}
continue;
- }
-
- bool StoreIsX2, IsOffen;
- if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
- CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+ case BUFFER_STORE_OFFEN:
+ case BUFFER_STORE_OFFSET:
+ case BUFFER_STORE_OFFEN_exact:
+ case BUFFER_STORE_OFFSET_exact:
CI.EltSize = 4;
- CI.IsX2 = StoreIsX2;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeBufferStorePair(CI);
- if (!CI.IsX2)
- CreatedX2++;
+ OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
} else {
++I;
}
@@ -958,12 +1164,10 @@ bool SILoadStoreOptimizer::runOnMachineF
bool Modified = false;
for (MachineBasicBlock &MBB : MF) {
- CreatedX2 = 0;
- Modified |= optimizeBlock(MBB);
-
- // Run again to convert x2 to x4.
- if (CreatedX2 >= 1)
+ do {
+ OptimizeAgain = false;
Modified |= optimizeBlock(MBB);
+ } while (OptimizeAgain);
}
return Modified;
Modified: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp Wed Dec 12 08:15:21 2018
@@ -128,6 +128,49 @@ int getMaskedMIMGOp(unsigned Opc, unsign
return NewInfo ? NewInfo->Opcode : -1;
}
+struct MUBUFInfo {
+ uint16_t Opcode;
+ uint16_t BaseOpcode;
+ uint8_t dwords;
+ bool has_vaddr;
+ bool has_srsrc;
+ bool has_soffset;
+};
+
+#define GET_MUBUFInfoTable_DECL
+#define GET_MUBUFInfoTable_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+
+int getMUBUFBaseOpcode(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc);
+ return Info ? Info->BaseOpcode : -1;
+}
+
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) {
+ const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords);
+ return Info ? Info->Opcode : -1;
+}
+
+int getMUBUFDwords(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->dwords : 0;
+}
+
+bool getMUBUFHasVAddr(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->has_vaddr : false;
+}
+
+bool getMUBUFHasSrsrc(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->has_srsrc : false;
+}
+
+bool getMUBUFHasSoffset(unsigned Opc) {
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->has_soffset : false;
+}
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
Modified: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h Wed Dec 12 08:15:21 2018
@@ -221,6 +221,24 @@ LLVM_READONLY
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
LLVM_READONLY
+int getMUBUFBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords);
+
+LLVM_READONLY
+int getMUBUFDwords(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasVAddr(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasSrsrc(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasSoffset(unsigned Opc);
+
+LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Modified: llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll Wed Dec 12 08:15:21 2018
@@ -36,10 +36,10 @@ define amdgpu_kernel void @load_v2i8_to_
; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: v_cvt_f32_ubyte3_e32
-; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
Modified: llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll Wed Dec 12 08:15:21 2018
@@ -60,8 +60,7 @@ endif:
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
-; GCN-DAG: buffer_store_dword v
-; GCN-DAG: buffer_store_dwordx2
+; GCN-DAG: buffer_store_dwordx3
define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
%v = load <3 x i32>, <3 x i32> addrspace(1)* %in
Modified: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll Wed Dec 12 08:15:21 2018
@@ -103,8 +103,7 @@ define amdgpu_kernel void @dynamic_inser
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
-; GCN-DAG: buffer_store_dwordx2 v
-; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx3 v
define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
@@ -173,8 +172,7 @@ define amdgpu_kernel void @dynamic_inser
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
-; GCN-DAG: buffer_store_dwordx2 v
-; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx3 v
define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
%vecins = insertelement <3 x i32> %a, i32 5, i32 %b
store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll Wed Dec 12 08:15:21 2018
@@ -193,6 +193,22 @@ main_body:
ret void
}
+;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 12
+ %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ %r1 = extractelement <2 x float> %vr1, i32 0
+ %r2 = extractelement <2 x float> %vr1, i32 1
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true)
+ ret void
+}
+
;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
@@ -227,6 +243,20 @@ main_body:
ret void
}
+;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
+main_body:
+ %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
+ %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
+ %r1 = extractelement <2 x float> %vr1, i32 0
+ %r2 = extractelement <2 x float> %vr1, i32 1
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true)
+ ret void
+}
+
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll Wed Dec 12 08:15:21 2018
@@ -147,6 +147,41 @@ define amdgpu_ps void @buffer_store_x2_o
ret void
}
+;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
+define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) {
+ %a1 = add i32 %a, 28
+ %a2 = add i32 %a, 32
+ %a3 = add i32 %a, 36
+ call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) {
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 12
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) {
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 8
+ call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ ret void
+}
+
;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged:
;CHECK-NOT: s_waitcnt
;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
@@ -164,12 +199,40 @@ define amdgpu_ps void @buffer_store_x1_o
;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) {
call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
ret void
}
+;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) {
+ call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) {
+ call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8
+define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) {
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+ ret void
+}
+
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll?rev=348937&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll Wed Dec 12 08:15:21 2018
@@ -0,0 +1,114 @@
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}s_buffer_load_imm:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4
+define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
+main_body:
+ %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
+ %bitcast = bitcast i32 %load to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_load_index:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) {
+main_body:
+ %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
+ %bitcast = bitcast i32 %load to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_loadx2_imm:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
+define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
+main_body:
+ %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0)
+ %bitcast = bitcast <2 x i32> %load to <2 x float>
+ %x = extractelement <2 x float> %bitcast, i32 0
+ %y = extractelement <2 x float> %bitcast, i32 1
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_loadx2_index:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) {
+main_body:
+ %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
+ %bitcast = bitcast <2 x i32> %load to <2 x float>
+ %x = extractelement <2 x float> %bitcast, i32 0
+ %y = extractelement <2 x float> %bitcast, i32 1
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_loadx4_imm:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8
+define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
+main_body:
+ %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0)
+ %bitcast = bitcast <4 x i32> %load to <4 x float>
+ %x = extractelement <4 x float> %bitcast, i32 0
+ %y = extractelement <4 x float> %bitcast, i32 1
+ %z = extractelement <4 x float> %bitcast, i32 2
+ %w = extractelement <4 x float> %bitcast, i32 3
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_loadx4_index:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) {
+main_body:
+ %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
+ %bitcast = bitcast <4 x i32> %load to <4 x float>
+ %x = extractelement <4 x float> %bitcast, i32 0
+ %y = extractelement <4 x float> %bitcast, i32 1
+ %z = extractelement <4 x float> %bitcast, i32 2
+ %w = extractelement <4 x float> %bitcast, i32 3
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4
+define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
+main_body:
+ %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
+ %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
+ %x = bitcast i32 %load0 to float
+ %y = bitcast i32 %load1 to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4:
+;CHECK-NOT: s_waitcnt;
+;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8
+define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
+main_body:
+ %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
+ %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0)
+ %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0)
+ %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0)
+ %x = bitcast i32 %load0 to float
+ %y = bitcast i32 %load1 to float
+ %z = bitcast i32 %load2 to float
+ %w = bitcast i32 %load3 to float
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+ ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
+declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
Modified: llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll Wed Dec 12 08:15:21 2018
@@ -164,8 +164,8 @@ define amdgpu_kernel void @merge_global_
}
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
-; SI-DAG: buffer_store_dwordx2
-; SI-DAG: buffer_store_dword
+; SI-DAG: buffer_store_dwordx3
+; SI-NOT: buffer_store_dwordx2
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
@@ -274,11 +274,9 @@ define amdgpu_kernel void @merge_global_
}
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
-; SI-DAG: buffer_load_dwordx2
-; SI-DAG: buffer_load_dword v
+; SI-DAG: buffer_load_dwordx3
; GCN: s_waitcnt
-; SI-DAG: buffer_store_dword v
-; SI-DAG: buffer_store_dwordx2 v
+; SI-DAG: buffer_store_dwordx3 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -563,8 +561,7 @@ define amdgpu_kernel void @merge_global_
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx2
-; GCN: buffer_store_dword v
+; GCN: buffer_store_dwordx3
define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
store i32 34, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
@@ -611,13 +608,11 @@ define amdgpu_kernel void @merge_global_
; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
@@ -644,13 +639,11 @@ define amdgpu_kernel void @copy_v3i64_al
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
Modified: llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/store-global.ll?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-global.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/store-global.ll Wed Dec 12 08:15:21 2018
@@ -273,8 +273,7 @@ entry:
}
; FUNC-LABEL: {{^}}store_v3i32:
-; SIVI-DAG: buffer_store_dwordx2
-; SIVI-DAG: buffer_store_dword v
+; SIVI-DAG: buffer_store_dwordx3
; GFX9-DAG: global_store_dwordx2
; GFX9-DAG: global_store_dword v
Modified: llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll?rev=348937&r1=348936&r2=348937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll Wed Dec 12 08:15:21 2018
@@ -89,8 +89,7 @@ define amdgpu_kernel void @local_store_v
}
; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
-; GCN-DAG: buffer_store_dwordx2
-; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx3
define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
%trunc = trunc <3 x i64> %x to <3 x i32>
store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out