[llvm] r317755 - AMDGPU: Merge BUFFER_STORE_DWORD_OFFEN/OFFSET into x2, x4
Marek Olsak via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 8 17:52:55 PST 2017
Author: mareko
Date: Wed Nov 8 17:52:55 2017
New Revision: 317755
URL: http://llvm.org/viewvc/llvm-project?rev=317755&view=rev
Log:
AMDGPU: Merge BUFFER_STORE_DWORD_OFFEN/OFFSET into x2, x4
Summary:
Only 56 shaders (out of 48486) are affected.
Totals from affected shaders (changed stats only):
SGPRS: 2420 -> 2460 (1.65 %)
Spilled VGPRs: 94 -> 112 (19.15 %)
Scratch size: 524 -> 528 (0.76 %) dwords per thread
Code Size: 187400 -> 184992 (-1.28 %) bytes
One DiRT Showdown shader spills 6 more VGPRs.
One Grid Autosport shader spills 12 more VGPRs.
The other 54 shaders only have a decrease in code size.
(I'm ignoring the SGPR noise)
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D39012
Modified:
llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp?rev=317755&r1=317754&r2=317755&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp Wed Nov 8 17:52:55 2017
@@ -80,6 +80,8 @@ class SILoadStoreOptimizer : public Mach
S_BUFFER_LOAD_IMM,
BUFFER_LOAD_OFFEN,
BUFFER_LOAD_OFFSET,
+ BUFFER_STORE_OFFEN,
+ BUFFER_STORE_OFFSET,
};
struct CombineInfo {
@@ -114,6 +116,9 @@ private:
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
+ unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
+ bool &IsOffen) const;
+ MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
public:
static char ID;
@@ -231,10 +236,8 @@ bool SILoadStoreOptimizer::offsetsCanBeC
CI.UseST64 = false;
CI.BaseOff = 0;
- // SMEM offsets must be consecutive.
- if (CI.InstClass == S_BUFFER_LOAD_IMM ||
- CI.InstClass == BUFFER_LOAD_OFFEN ||
- CI.InstClass == BUFFER_LOAD_OFFSET) {
+ // Handle SMEM and VMEM instructions.
+ if (CI.InstClass != DS_READ_WRITE) {
unsigned Diff = CI.IsX2 ? 2 : 1;
return (EltOffset0 + Diff == EltOffset1 ||
EltOffset1 + Diff == EltOffset0) &&
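For context: after this change, offsetsCanBeCombined no longer lists the
individual buffer opcodes; every non-LDS class, SMEM and VMEM alike,
shares one adjacency test, with the stride doubled for x2 instructions.
Below is a minimal standalone sketch of that test, assuming offsets are
already scaled to element (dword) units; the free-function form and the
name are illustrative only, the real logic sits inside
SILoadStoreOptimizer::offsetsCanBeCombined:

  // Two element offsets can merge iff they are exactly one
  // "instruction width" apart. IsX2 means each access already covers
  // two dwords, so an adjacent pair sits two elements away.
  static bool offsetsAreAdjacent(unsigned EltOffset0, unsigned EltOffset1,
                                 bool IsX2) {
    unsigned Diff = IsX2 ? 2 : 1;
    return EltOffset0 + Diff == EltOffset1 ||
           EltOffset1 + Diff == EltOffset0;
  }

  // Example: byte offsets 4 and 8 are dword offsets 1 and 2, so two x1
  // stores merge; dword offsets 1 and 3 only merge as an x2 pair.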
@@ -297,11 +300,13 @@ bool SILoadStoreOptimizer::findMatchingI
AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
break;
case BUFFER_LOAD_OFFEN:
+ case BUFFER_STORE_OFFEN:
AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
break;
case BUFFER_LOAD_OFFSET:
+ case BUFFER_STORE_OFFSET:
AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
break;
@@ -680,6 +685,90 @@ MachineBasicBlock::iterator SILoadStoreO
return Next;
}
+unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
+ const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
+ IsX2 = false;
+ IsOffen = false;
+
+ switch (I.getOpcode()) {
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
+ IsX2 = true;
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
+ IsX2 = true;
+ IsOffen = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
+ IsX2 = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
+ IsX2 = true;
+ return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
+ }
+ return 0;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ bool Unused1, Unused2;
+ unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the new source register.
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
+
+ const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+ .add(*Src0)
+ .addImm(SubRegIdx0)
+ .add(*Src1)
+ .addImm(SubRegIdx1);
+
+ auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
+ .addReg(SrcReg, RegState::Kill);
+
+ if (CI.InstClass == BUFFER_STORE_OFFEN)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ moveInstsAfter(MIB, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
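The one subtlety in mergeBufferStorePair is operand order: the merged
store is emitted at the lower of the two offsets, so when the first
store has the higher offset the two vdata values swap subregister slots
inside the REG_SEQUENCE rather than the instructions being reordered.
A hedged sketch of just that selection (hypothetical helper, assuming
the AMDGPU backend's generated subregister enumerators are in scope;
the pass inlines this logic as shown in the hunk above):

  #include <utility>

  // Pick subregister indices for the (first, second) store's data.
  // An x1 pair fills a 64-bit vreg (sub0/sub1); an x2 pair fills a
  // 128-bit vreg (sub0_sub1/sub2_sub3).
  static std::pair<unsigned, unsigned>
  pickStoreSubRegs(bool IsX2, int Offset0, int Offset1) {
    unsigned Idx0 = IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
    unsigned Idx1 = IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
    if (Offset0 > Offset1) // descending offsets: swap the data halves
      std::swap(Idx0, Idx1);
    return {Idx0, Idx1};
  }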
@@ -758,6 +847,22 @@ bool SILoadStoreOptimizer::optimizeBlock
if (!CI.IsX2)
CreatedX2++;
} else {
+ ++I;
+ }
+ continue;
+ }
+
+ bool StoreIsX2, IsOffen;
+ if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
+ CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+ CI.EltSize = 4;
+ CI.IsX2 = StoreIsX2;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeBufferStorePair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
++I;
}
continue;
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll?rev=317755&r1=317754&r2=317755&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll Wed Nov 8 17:52:55 2017
@@ -95,6 +95,81 @@ main_body:
ret void
}
+;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
+define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 8
+ %a3 = add i32 %a, 12
+ %a4 = add i32 %a, 16
+ %a5 = add i32 %a, 28
+ %a6 = add i32 %a, 32
+ call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
+;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
+;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
+define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 8
+ %a3 = add i32 %a, 12
+ %a4 = add i32 %a, 16
+ %a5 = add i32 %a, 28
+ %a6 = add i32 %a, 32
+ call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1)
+ call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
+ %a1 = add i32 %a, 4
+ %a2 = add i32 %a, 12
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
+define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+ call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) {
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
+ ret void
+}
+
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
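The CHECK lines above encode the end state of repeated pairing: x1
stores merge into x2, and a later pass over the block (triggered by the
CreatedX2 counter) merges x2 pairs again into x4, so a run of four
consecutive dwords becomes one buffer_store_dwordx4 and a leftover
adjacent pair becomes one buffer_store_dwordx2. A toy model of that
observable grouping, not the pass's actual incremental algorithm, for a
sorted list of dword offsets:

  #include <cstdio>
  #include <vector>

  // Split sorted dword offsets into maximal consecutive runs, then
  // chop each run into x4/x2/x1 chunks, mirroring the expected output.
  static void printGroups(const std::vector<int> &DwordOffs) {
    size_t I = 0;
    while (I < DwordOffs.size()) {
      size_t Len = 1; // length of the consecutive run starting at I
      while (I + Len < DwordOffs.size() &&
             DwordOffs[I + Len] == DwordOffs[I] + (int)Len)
        ++Len;
      for (size_t Done = 0; Done < Len;) {
        size_t W = Len - Done >= 4 ? 4 : Len - Done >= 2 ? 2 : 1;
        if (W == 1)
          std::printf("buffer_store_dword offset:%d\n",
                      4 * DwordOffs[I + Done]);
        else
          std::printf("buffer_store_dwordx%zu offset:%d\n", W,
                      4 * DwordOffs[I + Done]);
        Done += W;
      }
      I += Len;
    }
  }

  // For dword offsets {1,2,3,4,7,8} (bytes 4..16, 28, 32) this prints
  // one dwordx4 at offset:4 and one dwordx2 at offset:28, matching
  // buffer_store_x1_offen_merged above.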
Modified: llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll?rev=317755&r1=317754&r2=317755&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll Wed Nov 8 17:52:55 2017
@@ -237,8 +237,7 @@ define amdgpu_kernel void @merge_global_
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
-; GCN: buffer_store_dword v
-; GCN: buffer_store_dword v
+; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -379,10 +378,7 @@ define amdgpu_kernel void @merge_global_
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v
; GCN: s_barrier
-; GCN: buffer_store_dword v
-; GCN: buffer_store_dword v
-; GCN: buffer_store_dword v
-; GCN: buffer_store_dword v
+; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2