[llvm] r317751 - AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Marek Olsak via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 8 17:52:23 PST 2017
Author: mareko
Date: Wed Nov 8 17:52:23 2017
New Revision: 317751
URL: http://llvm.org/viewvc/llvm-project?rev=317751&view=rev
Log:
AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4
Summary:
Only constant offsets (*_IMM opcodes) are merged.
It reuses code for LDS load/store merging.
It relies on the scheduler to group loads.
The results are mixed, I think they are mostly positive. Most shaders are
affected, so here are total stats only:
SGPRS: 2072198 -> 2151462 (3.83 %)
VGPRS: 1628024 -> 1634612 (0.40 %)
Spilled SGPRs: 7883 -> 8942 (13.43 %)
Spilled VGPRs: 97 -> 101 (4.12 %)
Scratch size: 1488 -> 1492 (0.27 %) dwords per thread
Code Size: 60222620 -> 52940672 (-12.09 %) bytes
Max Waves: 374337 -> 373066 (-0.34 %)
There is 13.4% increase in SGPR spilling, DiRT Showdown spills a few more
VGPRs (now 37), but 12% decrease in code size.
These are the new stats for SGPR spilling. We already spill a lot SGPRs,
so it's uncertain whether more spilling will make any difference since
SGPRs are always spilled to VGPRs:
SGPR SPILLING APPS Shaders SpillSGPR AvgPerSh
alien_isolation 2938 100 0.0
batman_arkham_origins 589 6 0.0
bioshock-infinite 1769 4 0.0
borderlands2 3968 22 0.0
counter_strike_glob.. 1142 60 0.1
deus_ex_mankind_div.. 1410 79 0.1
dirt-showdown 533 4 0.0
dirt_rally 364 1163 3.2
divinity 1052 2 0.0
dota2 1747 7 0.0
f1-2015 776 1515 2.0
grid_autosport 1767 1505 0.9
hitman 1413 273 0.2
left_4_dead_2 1762 4 0.0
life_is_strange 1296 26 0.0
mad_max 358 96 0.3
metro_2033_redux 2670 60 0.0
payday2 1362 22 0.0
portal 474 3 0.0
saints_row_iv 1704 8 0.0
serious_sam_3_bfe 392 1348 3.4
shadow_of_mordor 1418 12 0.0
shadow_warrior 3956 239 0.1
talos_principle 324 1735 5.4
thea 172 17 0.1
tomb_raider 1449 215 0.1
total_war_warhammer 242 56 0.2
ue4_effects_cave 295 55 0.2
ue4_elemental 572 12 0.0
unigine_tropics 210 56 0.3
unigine_valley 278 152 0.5
victor_vran 1262 84 0.1
yofrankie 82 2 0.0
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D38949
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=317751&r1=317750&r2=317751&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Wed Nov 8 17:52:23 2017
@@ -323,6 +323,14 @@ public:
return HasMadMixInsts;
}
+ bool hasSBufferLoadStoreAtomicDwordxN() const {
+ // Only use the "x1" variants on GFX9 or don't use the buffer variants.
+ // For x2 and higher variants, if the accessed region spans 2 VM pages and
+ // the second page is unmapped, the hw hangs.
+ // TODO: There is one future GFX9 chip that doesn't have this bug.
+ return getGeneration() != GFX9;
+ }
+
bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
Modified: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp?rev=317751&r1=317750&r2=317751&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp Wed Nov 8 17:52:23 2017
@@ -14,6 +14,12 @@
// ==>
// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
+// The same is done for certain SMEM opcodes, e.g.:
+// s_buffer_load_dword s4, s[0:3], 4
+// s_buffer_load_dword s5, s[0:3], 8
+// ==>
+// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
+//
//
// Future improvements:
//
@@ -76,23 +82,28 @@ class SILoadStoreOptimizer : public Mach
unsigned Offset0;
unsigned Offset1;
unsigned BaseOff;
+ bool GLC0;
+ bool GLC1;
bool UseST64;
+ bool IsSBufferLoadImm;
+ bool IsX2;
SmallVector<MachineInstr*, 8> InstsToMove;
};
private:
+ const SISubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
AliasAnalysis *AA = nullptr;
+ unsigned CreatedX2;
static bool offsetsCanBeCombined(CombineInfo &CI);
- bool findMatchingDSInst(CombineInfo &CI);
-
+ bool findMatchingInst(CombineInfo &CI);
MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
-
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
public:
static char ID;
@@ -210,6 +221,14 @@ bool SILoadStoreOptimizer::offsetsCanBeC
CI.UseST64 = false;
CI.BaseOff = 0;
+ // SMEM offsets must be consecutive.
+ if (CI.IsSBufferLoadImm) {
+ unsigned Diff = CI.IsX2 ? 2 : 1;
+ return (EltOffset0 + Diff == EltOffset1 ||
+ EltOffset1 + Diff == EltOffset0) &&
+ CI.GLC0 == CI.GLC1;
+ }
+
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
@@ -247,13 +266,18 @@ bool SILoadStoreOptimizer::offsetsCanBeC
return false;
}
-bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
+bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
- int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
- AMDGPU::OpName::addr);
+ unsigned AddrOpName;
+ if (CI.IsSBufferLoadImm)
+ AddrOpName = AMDGPU::OpName::sbase;
+ else
+ AddrOpName = AMDGPU::OpName::addr;
+
+ int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName);
const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
// We only ever merge operations with the same base address register, so don't
@@ -319,10 +343,18 @@ bool SILoadStoreOptimizer::findMatchingD
AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
AMDGPU::OpName::offset);
- CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
- CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+ CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
CI.Paired = MBBI;
+ if (CI.IsSBufferLoadImm) {
+ CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
+ CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
+ } else {
+ CI.Offset0 &= 0xffff;
+ CI.Offset1 &= 0xffff;
+ }
+
// Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
@@ -488,6 +520,51 @@ MachineBasicBlock::iterator SILoadStoreO
return Next;
}
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
+ AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+
+ const TargetRegisterClass *SuperRC =
+ CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+ BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
+ unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+
+ // Handle descending offsets
+ if (CI.Offset0 > CI.Offset1)
+ std::swap(SubRegIdx0, SubRegIdx1);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return Next;
+}
+
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
@@ -505,10 +582,11 @@ bool SILoadStoreOptimizer::optimizeBlock
CombineInfo CI;
CI.I = I;
+ CI.IsSBufferLoadImm = false;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- if (findMatchingDSInst(CI)) {
+ if (findMatchingInst(CI)) {
Modified = true;
I = mergeRead2Pair(CI);
} else {
@@ -516,9 +594,10 @@ bool SILoadStoreOptimizer::optimizeBlock
}
continue;
- } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+ }
+ if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- if (findMatchingDSInst(CI)) {
+ if (findMatchingInst(CI)) {
Modified = true;
I = mergeWrite2Pair(CI);
} else {
@@ -527,6 +606,23 @@ bool SILoadStoreOptimizer::optimizeBlock
continue;
}
+ if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
+ (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+ Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+ // EltSize is in units of the offset encoding.
+ CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
+ CI.IsSBufferLoadImm = true;
+ CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ I = mergeSBufferLoadImmPair(CI);
+ if (!CI.IsX2)
+ CreatedX2++;
+ } else {
+ ++I;
+ }
+ continue;
+ }
++I;
}
@@ -538,11 +634,11 @@ bool SILoadStoreOptimizer::runOnMachineF
if (skipFunction(*MF.getFunction()))
return false;
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- if (!STM.loadStoreOptEnabled())
+ STM = &MF.getSubtarget<SISubtarget>();
+ if (!STM->loadStoreOptEnabled())
return false;
- TII = STM.getInstrInfo();
+ TII = STM->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -553,9 +649,16 @@ bool SILoadStoreOptimizer::runOnMachineF
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
+ CreatedX2 = 0;
for (MachineBasicBlock &MBB : MF)
Modified |= optimizeBlock(MBB);
+ // Run again to convert x2 to x4.
+ if (CreatedX2 >= 1) {
+ for (MachineBasicBlock &MBB : MF)
+ Modified |= optimizeBlock(MBB);
+ }
+
return Modified;
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/smrd.ll?rev=317751&r1=317750&r2=317751&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll Wed Nov 8 17:52:23 2017
@@ -1,11 +1,12 @@
-; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
; SMRD load with an immediate offset.
; GCN-LABEL: {{^}}smrd0:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
%tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
@@ -17,7 +18,7 @@ entry:
; SMRD load with the largest possible immediate offset.
; GCN-LABEL: {{^}}smrd1:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
%tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
@@ -31,7 +32,7 @@ entry:
; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
; GCN: s_endpgm
define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
@@ -61,7 +62,7 @@ entry:
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
-; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
entry:
%tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
@@ -72,8 +73,8 @@ entry:
; SMRD load with an offset greater than the largest possible immediate on VI
; GCN-LABEL: {{^}}smrd5:
-; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
-; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
+; SIVIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
@@ -104,7 +105,7 @@ main_body:
; SMRD load using the load.const.v4i32 intrinsic with an immediate offset
; GCN-LABEL: {{^}}smrd_load_const0:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
@@ -118,7 +119,7 @@ main_body:
; offset.
; GCN-LABEL: {{^}}smrd_load_const1:
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
@@ -135,7 +136,7 @@ main_body:
; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
@@ -150,7 +151,7 @@ main_body:
; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
-; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0
@@ -162,8 +163,8 @@ main_body:
; SMRD load with an offset greater than the largest possible immediate on VI
; GCN-LABEL: {{^}}smrd_load_const4:
-; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
-; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVIGFX9: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
+; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
@@ -212,6 +213,31 @@ main_body:
ret float %r
}
+; GCN-LABEL: {{^}}smrd_imm_merged:
+; GCN-NEXT: BB#
+; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
+; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
+; VI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
+; VI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 {
+main_body:
+ %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
+ %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 8)
+ %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 12)
+ %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 16)
+ %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 28)
+ %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 32)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
+ ret void
+}
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
More information about the llvm-commits
mailing list