[llvm] r317752 - AMDGPU: Merge BUFFER_LOAD_DWORD_OFFEN into x2, x4
Author: mareko
Date: Wed Nov 8 17:52:30 2017
New Revision: 317752
URL: http://llvm.org/viewvc/llvm-project?rev=317752&view=rev
Log:
AMDGPU: Merge BUFFER_LOAD_DWORD_OFFEN into x2, x4
Summary:
9.9% code size decrease in affected shaders.
Totals (changed stats only):
SGPRS: 2151462 -> 2170646 (0.89 %)
VGPRS: 1634612 -> 1640288 (0.35 %)
Spilled SGPRs: 8942 -> 8940 (-0.02 %)
Code Size: 52940672 -> 51727288 (-2.29 %) bytes
Max Waves: 373066 -> 371718 (-0.36 %)
Totals from affected shaders:
SGPRS: 283520 -> 302704 (6.77 %)
VGPRS: 227632 -> 233308 (2.49 %)
Spilled SGPRs: 3966 -> 3964 (-0.05 %)
Code Size: 12203080 -> 10989696 (-9.94 %) bytes
Max Waves: 44070 -> 42722 (-3.06 %)
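
For illustration, the pass now turns runs of adjacent x1 OFFEN loads into
wider loads. Schematically (register assignments are made up; the offsets
follow the buffer_load_x1_offen_merged test below):

  buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
  buffer_load_dword v2, v0, s[0:3], 0 offen offset:8
  buffer_load_dword v3, v0, s[0:3], 0 offen offset:12
  buffer_load_dword v4, v0, s[0:3], 0 offen offset:16
  buffer_load_dword v5, v0, s[0:3], 0 offen offset:28
  buffer_load_dword v6, v0, s[0:3], 0 offen offset:32

becomes

  buffer_load_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
  buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28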
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38950
Modified:
llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp?rev=317752&r1=317751&r2=317752&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp Wed Nov 8 17:52:30 2017
@@ -75,6 +75,12 @@ using namespace llvm;
namespace {
class SILoadStoreOptimizer : public MachineFunctionPass {
+ enum InstClassEnum {
+ DS_READ_WRITE,
+ S_BUFFER_LOAD_IMM,
+ BUFFER_LOAD_OFFEN,
+ };
+
struct CombineInfo {
MachineBasicBlock::iterator I;
MachineBasicBlock::iterator Paired;
@@ -82,10 +88,12 @@ class SILoadStoreOptimizer : public Mach
unsigned Offset0;
unsigned Offset1;
unsigned BaseOff;
+ InstClassEnum InstClass;
bool GLC0;
bool GLC1;
+ bool SLC0;
+ bool SLC1;
bool UseST64;
- bool IsSBufferLoadImm;
bool IsX2;
SmallVector<MachineInstr*, 8> InstsToMove;
};
@@ -104,6 +112,7 @@ private:
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeBufferLoadOffenPair(CombineInfo &CI);
public:
static char ID;
@@ -222,11 +231,13 @@ bool SILoadStoreOptimizer::offsetsCanBeC
CI.BaseOff = 0;
// SMEM offsets must be consecutive.
- if (CI.IsSBufferLoadImm) {
+ if (CI.InstClass == S_BUFFER_LOAD_IMM ||
+ CI.InstClass == BUFFER_LOAD_OFFEN) {
unsigned Diff = CI.IsX2 ? 2 : 1;
return (EltOffset0 + Diff == EltOffset1 ||
EltOffset1 + Diff == EltOffset0) &&
- CI.GLC0 == CI.GLC1;
+ CI.GLC0 == CI.GLC1 &&
+ (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
}
// If the offset in elements doesn't fit in 8-bits, we might be able to use
@@ -271,20 +282,38 @@ bool SILoadStoreOptimizer::findMatchingI
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
- unsigned AddrOpName;
- if (CI.IsSBufferLoadImm)
- AddrOpName = AMDGPU::OpName::sbase;
- else
- AddrOpName = AMDGPU::OpName::addr;
-
- int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName);
- const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
-
- // We only ever merge operations with the same base address register, so don't
- // bother scanning forward if there are no other uses.
- if (TargetRegisterInfo::isPhysicalRegister(AddrReg0.getReg()) ||
- MRI->hasOneNonDBGUse(AddrReg0.getReg()))
- return false;
+ unsigned AddrOpName[3] = {0};
+ int AddrIdx[3];
+ const MachineOperand *AddrReg[3];
+ unsigned NumAddresses = 0;
+
+ switch (CI.InstClass) {
+ case DS_READ_WRITE:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
+ break;
+ case S_BUFFER_LOAD_IMM:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
+ break;
+ case BUFFER_LOAD_OFFEN:
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+ break;
+ default:
+ llvm_unreachable("invalid InstClass");
+ }
+
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
+ AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
+
+ // We only ever merge operations with the same base address register, so don't
+ // bother scanning forward if there are no other uses.
+ if (AddrReg[i]->isReg() &&
+ (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
+ MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
+ return false;
+ }
++MBBI;
@@ -335,24 +364,45 @@ bool SILoadStoreOptimizer::findMatchingI
if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
continue;
- const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+ bool Match = true;
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
+
+ if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
+ if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
+ AddrReg[i]->getImm() != AddrRegNext.getImm()) {
+ Match = false;
+ break;
+ }
+ continue;
+ }
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
+ AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
+ Match = false;
+ break;
+ }
+ }
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
- if (AddrReg0.getReg() == AddrReg1.getReg() &&
- AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ if (Match) {
int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
AMDGPU::OpName::offset);
CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
CI.Paired = MBBI;
- if (CI.IsSBufferLoadImm) {
- CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
- CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
- } else {
+ if (CI.InstClass == DS_READ_WRITE) {
CI.Offset0 &= 0xffff;
CI.Offset1 &= 0xffff;
+ } else {
+ CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
+ CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
+ if (CI.InstClass == BUFFER_LOAD_OFFEN) {
+ CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
+ CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
+ }
}
// Check both offsets fit in the reduced range.
@@ -565,6 +615,55 @@ MachineBasicBlock::iterator SILoadStoreO
return Next;
}
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadOffenPair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ unsigned Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
+ AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+ BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
@@ -582,9 +681,9 @@ bool SILoadStoreOptimizer::optimizeBlock
CombineInfo CI;
CI.I = I;
- CI.IsSBufferLoadImm = false;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
+ CI.InstClass = DS_READ_WRITE;
CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
if (findMatchingInst(CI)) {
Modified = true;
@@ -596,6 +695,7 @@ bool SILoadStoreOptimizer::optimizeBlock
continue;
}
if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+ CI.InstClass = DS_READ_WRITE;
CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
if (findMatchingInst(CI)) {
Modified = true;
@@ -610,8 +710,8 @@ bool SILoadStoreOptimizer::optimizeBlock
(Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
// EltSize is in units of the offset encoding.
+ CI.InstClass = S_BUFFER_LOAD_IMM;
CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
- CI.IsSBufferLoadImm = true;
CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
if (findMatchingInst(CI)) {
Modified = true;
@@ -619,6 +719,21 @@ bool SILoadStoreOptimizer::optimizeBlock
if (!CI.IsX2)
CreatedX2++;
} else {
+ ++I;
+ }
+ continue;
+ }
+ if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
+ Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) {
+ CI.InstClass = BUFFER_LOAD_OFFEN;
+ CI.EltSize = 4;
+ CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeBufferLoadOffenPair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
++I;
}
continue;
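
As a standalone sketch of the decisions mergeBufferLoadOffenPair() makes --
opcode width, merged offset, and which subregister of the wide result feeds
each original destination -- reduced to plain C++. Names mirror the patch;
the enum values and the harness itself are illustrative, not LLVM API:

// Sketch of the merge decisions in mergeBufferLoadOffenPair(), outside LLVM.
// SUB0..SUB2_SUB3 stand in for AMDGPU::sub0, sub1, sub0_sub1, sub2_sub3.
#include <algorithm>
#include <cstdio>
#include <utility>

enum SubRegIdx { SUB0, SUB1, SUB0_SUB1, SUB2_SUB3 };

// x1 pairs merge into an x2 load; x2 pairs merge into an x4 load.
static const char *pickOpcode(bool IsX2) {
  return IsX2 ? "BUFFER_LOAD_DWORDX4_OFFEN" : "BUFFER_LOAD_DWORDX2_OFFEN";
}

// The load at the lower offset receives the low half of the wide result;
// descending offsets are handled by swapping, as in the patch.
static void pickSubRegs(bool IsX2, unsigned Offset0, unsigned Offset1,
                        SubRegIdx &Idx0, SubRegIdx &Idx1) {
  Idx0 = IsX2 ? SUB0_SUB1 : SUB0;
  Idx1 = IsX2 ? SUB2_SUB3 : SUB1;
  if (Offset0 > Offset1)
    std::swap(Idx0, Idx1);
}

int main() {
  unsigned Offset0 = 8, Offset1 = 4;  // a descending x1 pair
  unsigned MergedOffset = std::min(Offset0, Offset1);
  SubRegIdx Idx0, Idx1;
  pickSubRegs(/*IsX2=*/false, Offset0, Offset1, Idx0, Idx1);
  std::printf("%s offset:%u, dest0 <- subreg %d, dest1 <- subreg %d\n",
              pickOpcode(false), MergedOffset, (int)Idx0, (int)Idx1);
  return 0;
}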
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll?rev=317752&r1=317751&r2=317752&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll Wed Nov 8 17:52:30 2017
@@ -126,8 +126,76 @@ entry:
ret float %val
}
+;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged:
+;CHECK-NEXT: BB#
+;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 8
+ %a3 = add i32 %a, 12
+ %a4 = add i32 %a, 16
+ %a5 = add i32 %a, 28
+ %a6 = add i32 %a, 32
+ %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
+ %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0)
+ %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0)
+ %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
+;CHECK-NEXT: BB#
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 8
+ %a3 = add i32 %a, 12
+ %a4 = add i32 %a, 16
+ %a5 = add i32 %a, 28
+ %a6 = add i32 %a, 32
+ %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0)
+ %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0)
+ %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1)
+ %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged:
+;CHECK-NEXT: BB#
+;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 12
+ %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ %r1 = extractelement <2 x float> %vr1, i32 0
+ %r2 = extractelement <2 x float> %vr1, i32 1
+ %r3 = extractelement <2 x float> %vr2, i32 0
+ %r4 = extractelement <2 x float> %vr2, i32 1
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+ ret void
+}
+
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
attributes #0 = { nounwind readonly }
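
The glc/slc test above exercises the new matching condition directly: loads
merge only within groups whose cache bits agree, which is why three separate
dwordx2 loads survive. A minimal standalone sketch of that condition --
field names mirror CombineInfo; the assert harness is illustrative:

// Standalone sketch of the adjacency + cache-bit test that
// offsetsCanBeCombined() now applies to BUFFER_LOAD_*_OFFEN pairs.
#include <cassert>

enum InstClassEnum { DS_READ_WRITE, S_BUFFER_LOAD_IMM, BUFFER_LOAD_OFFEN };

struct CombineInfo {
  InstClassEnum InstClass;
  bool IsX2;                        // merging two x2 loads into one x4
  unsigned EltOffset0, EltOffset1;  // offsets in element (dword) units
  bool GLC0, GLC1;                  // glc bits of the two loads
  bool SLC0, SLC1;                  // slc bits (checked for MUBUF only)
};

static bool offsetsCanBeCombined(const CombineInfo &CI) {
  unsigned Diff = CI.IsX2 ? 2 : 1;  // an x2 load covers two elements
  return (CI.EltOffset0 + Diff == CI.EltOffset1 ||
          CI.EltOffset1 + Diff == CI.EltOffset0) &&
         CI.GLC0 == CI.GLC1 &&
         (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
}

int main() {
  // Adjacent x1 loads with equal cache bits merge...
  CombineInfo CI{BUFFER_LOAD_OFFEN, false, 1, 2, false, false, false, false};
  assert(offsetsCanBeCombined(CI));
  // ...but an slc mismatch blocks the merge for MUBUF loads,
  // which is why the glc/slc test keeps three separate groups.
  CI.SLC1 = true;
  assert(!offsetsCanBeCombined(CI));
  return 0;
}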
Modified: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/smrd.ll?rev=317752&r1=317751&r2=317752&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll Wed Nov 8 17:52:30 2017
@@ -238,6 +238,29 @@ main_body:
ret void
}
+; GCN-LABEL: {{^}}smrd_vgpr_merged:
+; GCN-NEXT: BB#
+; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
+define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 {
+main_body:
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 8
+ %a3 = add i32 %a, 12
+ %a4 = add i32 %a, 16
+ %a5 = add i32 %a, 28
+ %a6 = add i32 %a, 32
+ %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a1)
+ %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a2)
+ %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a3)
+ %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a4)
+ %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a5)
+ %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a6)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
+ ret void
+}
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
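
(The smrd.ll test belongs here because S_BUFFER loads with a VGPR offset
cannot use SMRD encodings and are selected as MUBUF OFFEN loads instead;
smrd_vgpr_merged checks that those, too, now merge into dwordx4/dwordx2.)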