[llvm] 5f7ea85 - [AMDGPU] Remove unnecessary s_waitcnt between VMEM loads

Fri May 1 02:38:48 PDT 2020

Author: Jay Foad
Date: 2020-05-01T10:10:23+01:00
New Revision: 5f7ea85e789d5b5f3f463e538a28c040e373620b

URL: https://github.com/llvm/llvm-project/commit/5f7ea85e789d5b5f3f463e538a28c040e373620b
DIFF: https://github.com/llvm/llvm-project/commit/5f7ea85e789d5b5f3f463e538a28c040e373620b.diff

LOG: [AMDGPU] Remove unnecessary s_waitcnt between VMEM loads

VMEM loads of the same type (sampler vs no sampler) are guaranteed to
write their result registers in order, so there is no need for an
s_waitcnt even if they write to overlapping vgprs.

Differential Revision: https://reviews.llvm.org/D79176

Added: 
    llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
    llvm/test/CodeGen/AMDGPU/shl.ll
    llvm/test/CodeGen/AMDGPU/wait.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b55c6081fd6a..c115d26fa6a3 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -164,6 +164,28 @@ enum RegisterMapping {
   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
 };
 
+// Enumerate different types of result-returning VMEM operations. Although
+// s_waitcnt orders them all with a single vmcnt counter, in the absence of
+// s_waitcnt only instructions of the same VmemType are guaranteed to write
+// their results in order -- so there is no need to insert an s_waitcnt between
+// two instructions of the same type that write the same vgpr.
+enum VmemType {
+  // BUF instructions and MIMG instructions without a sampler.
+  VMEM_NOSAMPLER,
+  // MIMG instructions with a sampler.
+  VMEM_SAMPLER,
+};
+
+VmemType getVmemType(const MachineInstr &Inst) {
+  assert(SIInstrInfo::isVMEM(Inst));
+  if (!SIInstrInfo::isMIMG(Inst))
+    return VMEM_NOSAMPLER;
+  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
+  return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
+             ? VMEM_SAMPLER
+             : VMEM_NOSAMPLER;
+}
+
 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
   switch (T) {
   case VM_CNT:
@@ -281,6 +303,18 @@ class WaitcntBrackets {
     LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
   }
 
+  // Return true if there might be pending writes to the specified vgpr by VMEM
+  // instructions with types different from V.
+  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
+    assert(GprNo < NUM_ALL_VGPRS);
+    return VgprVmemTypes[GprNo] & ~(1 << V);
+  }
+
+  void clearVgprVmemTypes(int GprNo) {
+    assert(GprNo < NUM_ALL_VGPRS);
+    VgprVmemTypes[GprNo] = 0;
+  }
+
   void print(raw_ostream &);
   void dump() { print(dbgs()); }
 
@@ -337,6 +371,9 @@ class WaitcntBrackets {
   unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
   // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
   unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+  // write to each vgpr.
+  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
 };
 
 class SIInsertWaitcnts : public MachineFunctionPass {
@@ -617,8 +654,15 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       if (!Op.isReg() || !Op.isDef())
         continue;
       RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
-      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
-        continue;
+      if (T == VM_CNT) {
+        if (Interval.first >= NUM_ALL_VGPRS)
+          continue;
+        if (SIInstrInfo::isVMEM(Inst)) {
+          VmemType V = getVmemType(Inst);
+          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
+            VgprVmemTypes[RegNo] |= 1 << V;
+        }
+      }
       for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
         setRegScore(RegNo, T, CurrScore);
       }
@@ -982,8 +1026,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
           if (TRI->isVGPR(*MRI, Op.getReg())) {
-            ScoreBrackets.determineWait(
-                VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+            // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
+            // previous write and this write are the same type of VMEM
+            // instruction, in which case they're guaranteed to write their
+            // results in order anyway.
+            if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
+                ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
+                                                       getVmemType(MI))) {
+              ScoreBrackets.determineWait(
+                  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+              ScoreBrackets.clearVgprVmemTypes(RegNo);
+            }
             if (Op.isDef()) {
               ScoreBrackets.determineWait(
                   EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
@@ -1296,6 +1349,14 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
       RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
     }
 
+    if (T == VM_CNT) {
+      for (int J = 0; J <= VgprUB; J++) {
+        unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
+        RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
+        VgprVmemTypes[J] = NewVmemTypes;
+      }
+    }
+
     if (T == LGKM_CNT) {
       for (int J = 0; J <= SgprUB; J++) {
         RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);

diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 64206d452280..5cbcba17931b 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -16,7 +16,7 @@
 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
 ; W64: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W64: s_xor_b64 exec, exec, [[CMP]]
 ; W64: s_cbranch_execnz [[LOOPBB]]
@@ -34,7 +34,7 @@
 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
 ; W32: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
 ; W32: s_cbranch_execnz [[LOOPBB]]
@@ -59,7 +59,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
 ; W64: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W64: s_xor_b64 exec, exec, [[CMP]]
 ; W64: s_cbranch_execnz [[LOOPBB0]]
@@ -77,7 +77,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
 ; W64: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W64: s_xor_b64 exec, exec, [[CMP]]
 ; W64: s_cbranch_execnz [[LOOPBB1]]
@@ -99,7 +99,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
 ; W32: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
 ; W32: s_cbranch_execnz [[LOOPBB0]]
@@ -117,7 +117,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
 ; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
 ; W32: s_cbranch_execnz [[LOOPBB1]]
@@ -150,7 +150,7 @@ entry:
 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
 ; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W64: s_xor_b64 exec, exec, [[CMP]]
 ; W64: s_cbranch_execnz [[LOOPBB0]]
@@ -171,7 +171,7 @@ entry:
 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
-; W64: s_waitcnt vmcnt(0)
+; W64: s_nop 0
 ; W64: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W64: s_xor_b64 exec, exec, [[CMP]]
 ; W64: s_cbranch_execnz [[LOOPBB1]]
@@ -196,7 +196,7 @@ entry:
 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
 ; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
 ; W32: s_cbranch_execnz [[LOOPBB0]]
@@ -217,7 +217,7 @@ entry:
 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
-; W32: s_waitcnt vmcnt(0)
+; W32: s_nop 0
 ; W32: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
 ; W32: s_cbranch_execnz [[LOOPBB1]]
@@ -240,11 +240,8 @@ entry:
 
 ; W64-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
 ; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
 ; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
 ; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
 ; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
 ; W64-O0: s_waitcnt vmcnt(0)
 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
@@ -278,11 +275,8 @@ entry:
 
 ; W64-O0: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
 ; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
 ; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
 ; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
 ; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
 ; W64-O0: s_waitcnt vmcnt(0)
 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]

diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index c96ff256c8c6..da15cff4c91a 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -742,8 +742,8 @@ define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
 ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GCN-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_load_dwordx4 v[11:14], off, s[4:7], 0 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v10
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshl_b64 v[6:7], v[6:7], v13

diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll
index dcc7bfa2c01e..8d6864f967e9 100644
--- a/llvm/test/CodeGen/AMDGPU/wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait.ll
@@ -9,7 +9,6 @@
 ; DEFAULT: s_load_dwordx4
 ; DEFAULT: s_waitcnt lgkmcnt(0)
 ; DEFAULT: buffer_load_format_xyzw
-; DEFAULT: s_waitcnt vmcnt(0)
 ; DEFAULT: buffer_load_format_xyzw
 ; DEFAULT: s_waitcnt vmcnt(0)
 ; DEFAULT: exp

diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
new file mode 100644
index 000000000000..90009b608428
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
@@ -0,0 +1,70 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+
+# Two buffer loads with overlapping outputs. No waitcnt required.
+---
+name: buffer_buffer
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
+    ; GFX9-LABEL: name: buffer_buffer
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
+    ; GFX9: S_WAITCNT 0
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec
+...
+
+# Two tbuffer loads with overlapping outputs. No waitcnt required.
+---
+name: tbuffer_tbuffer
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
+    ; GFX9-LABEL: name: tbuffer_tbuffer
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
+    ; GFX9: S_WAITCNT 0
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec
+...
+
+# Two gathers with overlapping outputs. (Note gathers can't be trimmed because
+# dmask means something different.) No waitcnt required.
+---
+name: gather_gather
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+    ; GFX9-LABEL: name: gather_gather
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+    ; GFX9: S_WAITCNT 0
+    ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+...
+
+# Image load vs image sample. Waitcnt required because they are not guaranteed
+# to write their results in order, despite both using the s_waitcnt vmcnt
+# counter.
+---
+name: nosampler_sampler
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX9-LABEL: name: nosampler_sampler
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GFX9: S_WAITCNT 0
+    ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX9: S_WAITCNT 3952
+    ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
+    $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec
+...