[llvm] 5f7ea85 - [AMDGPU] Remove unnecessary s_waitcnt between VMEM loads
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri May 1 02:38:48 PDT 2020
Author: Jay Foad
Date: 2020-05-01T10:10:23+01:00
New Revision: 5f7ea85e789d5b5f3f463e538a28c040e373620b
URL: https://github.com/llvm/llvm-project/commit/5f7ea85e789d5b5f3f463e538a28c040e373620b
DIFF: https://github.com/llvm/llvm-project/commit/5f7ea85e789d5b5f3f463e538a28c040e373620b.diff
LOG: [AMDGPU] Remove unnecessary s_waitcnt between VMEM loads
VMEM loads of the same type (sampler vs no sampler) are guaranteed to
write their result registers in order, so there is no need for an
s_waitcnt even if they write to overlapping vgprs.
Differential Revision: https://reviews.llvm.org/D79176
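
The heart of the patch is to track, per vgpr, which kinds of VMEM instructions may still have writes in flight, and to skip the wait for same-type WAW hazards. A simplified standalone sketch of that decision rule, distilled from the diff below for illustration only (needsVmcntWait is a hypothetical helper, not the pass's actual score-based bookkeeping):

    // One bit per VmemType is kept for each vgpr with pending VMEM writes.
    enum VmemType { VMEM_NOSAMPLER, VMEM_SAMPLER };

    bool needsVmcntWait(unsigned char PendingTypes, VmemType Cur, bool IsUse) {
      if (IsUse)
        return PendingTypes != 0;                // RAW: always wait
      return (PendingTypes & ~(1u << Cur)) != 0; // WAW: wait only across types
    }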
Added:
llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
llvm/test/CodeGen/AMDGPU/shl.ll
llvm/test/CodeGen/AMDGPU/wait.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b55c6081fd6a..c115d26fa6a3 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -164,6 +164,28 @@ enum RegisterMapping {
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
+// Enumerate different types of result-returning VMEM operations. Although
+// s_waitcnt orders them all with a single vmcnt counter, in the absence of
+// s_waitcnt only instructions of the same VmemType are guaranteed to write
+// their results in order -- so there is no need to insert an s_waitcnt between
+// two instructions of the same type that write the same vgpr.
+enum VmemType {
+ // BUF instructions and MIMG instructions without a sampler.
+ VMEM_NOSAMPLER,
+ // MIMG instructions with a sampler.
+ VMEM_SAMPLER,
+};
+
+VmemType getVmemType(const MachineInstr &Inst) {
+ assert(SIInstrInfo::isVMEM(Inst));
+ if (!SIInstrInfo::isMIMG(Inst))
+ return VMEM_NOSAMPLER;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
+ return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
+ ? VMEM_SAMPLER
+ : VMEM_NOSAMPLER;
+}
+
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
switch (T) {
case VM_CNT:
@@ -281,6 +303,18 @@ class WaitcntBrackets {
LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
}
+ // Return true if there might be pending writes to the specified vgpr by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
+ assert(GprNo < NUM_ALL_VGPRS);
+ return VgprVmemTypes[GprNo] & ~(1 << V);
+ }
+
+ void clearVgprVmemTypes(int GprNo) {
+ assert(GprNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[GprNo] = 0;
+ }
+
void print(raw_ostream &);
void dump() { print(dbgs()); }
@@ -337,6 +371,9 @@ class WaitcntBrackets {
unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
class SIInsertWaitcnts : public MachineFunctionPass {
@@ -617,8 +654,15 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
if (!Op.isReg() || !Op.isDef())
continue;
RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
- if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
- continue;
+ if (T == VM_CNT) {
+ if (Interval.first >= NUM_ALL_VGPRS)
+ continue;
+ if (SIInstrInfo::isVMEM(Inst)) {
+ VmemType V = getVmemType(Inst);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
+ VgprVmemTypes[RegNo] |= 1 << V;
+ }
+ }
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, T, CurrScore);
}
@@ -982,8 +1026,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(*MRI, Op.getReg())) {
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
+ // previous write and this write are the same type of VMEM
+ // instruction, in which case they're guaranteed to write their
+ // results in order anyway.
+ if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
+ ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
+ getVmemType(MI))) {
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.clearVgprVmemTypes(RegNo);
+ }
if (Op.isDef()) {
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
@@ -1296,6 +1349,14 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
}
+ if (T == VM_CNT) {
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
+ RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
+ VgprVmemTypes[J] = NewVmemTypes;
+ }
+ }
+
if (T == LGKM_CNT) {
for (int J = 0; J <= SgprUB; J++) {
RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
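
The merge hunk above ORs the per-vgpr type masks from the incoming state and reports strict dominance whenever new bits appear, which forces the dataflow to re-iterate. A toy model of that one step, under the same bitmask convention as the patch (bit N set means a VMEM op of VmemType N may have a pending write to the vgpr):

    bool mergeVmemTypes(unsigned char &Ours, unsigned char Theirs) {
      unsigned char Merged = Ours | Theirs;
      bool Changed = Merged != Ours; // new pending types flowed in
      Ours = Merged;
      return Changed;                // caller re-propagates if true
    }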
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 64206d452280..5cbcba17931b 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -16,7 +16,7 @@
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
; W64: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[CMP]]
; W64: s_cbranch_execnz [[LOOPBB]]
@@ -34,7 +34,7 @@
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
; W32: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
; W32: s_cbranch_execnz [[LOOPBB]]
@@ -59,7 +59,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
; W64: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[CMP]]
; W64: s_cbranch_execnz [[LOOPBB0]]
@@ -77,7 +77,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
; W64: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[CMP]]
; W64: s_cbranch_execnz [[LOOPBB1]]
@@ -99,7 +99,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
; W32: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
; W32: s_cbranch_execnz [[LOOPBB0]]
@@ -117,7 +117,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
; W32: s_cbranch_execnz [[LOOPBB1]]
@@ -150,7 +150,7 @@ entry:
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[CMP]]
; W64: s_cbranch_execnz [[LOOPBB0]]
@@ -171,7 +171,7 @@ entry:
; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
; W64: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W64: s_xor_b64 exec, exec, [[CMP]]
; W64: s_cbranch_execnz [[LOOPBB1]]
@@ -196,7 +196,7 @@ entry:
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
; W32: s_cbranch_execnz [[LOOPBB0]]
@@ -217,7 +217,7 @@ entry:
; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
; W32: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
; W32: s_cbranch_execnz [[LOOPBB1]]
@@ -240,11 +240,8 @@ entry:
; W64-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; W64-O0: s_waitcnt vmcnt(0)
; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
@@ -278,11 +275,8 @@ entry:
; W64-O0: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; W64-O0: s_waitcnt vmcnt(0)
; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index c96ff256c8c6..da15cff4c91a 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -742,8 +742,8 @@ define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[4:7], 0 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v13
diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll
index dcc7bfa2c01e..8d6864f967e9 100644
--- a/llvm/test/CodeGen/AMDGPU/wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait.ll
@@ -9,7 +9,6 @@
; DEFAULT: s_load_dwordx4
; DEFAULT: s_waitcnt lgkmcnt(0)
; DEFAULT: buffer_load_format_xyzw
-; DEFAULT: s_waitcnt vmcnt(0)
; DEFAULT: buffer_load_format_xyzw
; DEFAULT: s_waitcnt vmcnt(0)
; DEFAULT: exp
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
new file mode 100644
index 000000000000..90009b608428
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
@@ -0,0 +1,70 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+
+# Two buffer loads with overlapping outputs. No waitcnt required.
+---
+name: buffer_buffer
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
+ ; GFX9-LABEL: name: buffer_buffer
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec
+...
+
+# Two tbuffer loads with overlapping outputs. No waitcnt required.
+---
+name: tbuffer_tbuffer
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
+ ; GFX9-LABEL: name: tbuffer_tbuffer
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec
+...
+
+# Two gathers with overlapping outputs. (Note gathers can't be trimmed because
+# dmask means something different.) No waitcnt required.
+---
+name: gather_gather
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX9-LABEL: name: gather_gather
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+...
+
+# Image load vs image sample. Waitcnt required because they are not guaranteed
+# to write their results in order, despite both using the s_waitcnt vmcnt
+# counter.
+---
+name: nosampler_sampler
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX9-LABEL: name: nosampler_sampler
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX9: S_WAITCNT 0
+ ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_WAITCNT 3952
+ ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
+ $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
+...
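
In the nosampler_sampler test, the inserted S_WAITCNT 3952 is the raw simm16 form of vmcnt(0). A minimal decode sketch, assuming the pre-gfx10 immediate layout (vmcnt in bits [3:0] and [15:14], expcnt in [6:4], lgkmcnt in [11:8]); decodeWaitcnt is an illustrative helper, not LLVM's API:

    #include <cstdio>

    struct Waitcnt { unsigned VmCnt, ExpCnt, LgkmCnt; };

    Waitcnt decodeWaitcnt(unsigned Simm16) {
      return {(Simm16 & 0xf) | ((Simm16 >> 10) & 0x30), // vmcnt (split field)
              (Simm16 >> 4) & 0x7,                      // expcnt
              (Simm16 >> 8) & 0xf};                     // lgkmcnt
    }

    int main() {
      Waitcnt W = decodeWaitcnt(3952); // 0xf70, from the test above
      std::printf("vmcnt(%u) expcnt(%u) lgkmcnt(%u)\n",
                  W.VmCnt, W.ExpCnt, W.LgkmCnt);
      // vmcnt(0) expcnt(7) lgkmcnt(15): only the vmem counter is waited on.
    }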