[llvm] 5bf2a9d - [AMDGPU] Update VMEM scalar write hazard mitigation sequence

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 15 19:38:56 PDT 2020


Author: Carl Ritson
Date: 2020-07-16T11:37:45+09:00
New Revision: 5bf2a9dd40dbba6dacbbe61f843d4d3a6f54f294

URL: https://github.com/llvm/llvm-project/commit/5bf2a9dd40dbba6dacbbe61f843d4d3a6f54f294
DIFF: https://github.com/llvm/llvm-project/commit/5bf2a9dd40dbba6dacbbe61f843d4d3a6f54f294.diff

LOG: [AMDGPU] Update VMEM scalar write hazard mitigation sequence

Using s_waitcnt_depctr 0xffe3 is potentially faster than v_nop.

Reviewed By: rampitec, foad

Differential Revision: https://reviews.llvm.org/D83872

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
    llvm/test/CodeGen/AMDGPU/cc-update.ll
    llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
    llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
    llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
    llvm/test/CodeGen/AMDGPU/wave32.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8482dbfec250..222923187081 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -930,10 +930,12 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
     return false;
   };
 
-  auto IsExpiredFn = [] (MachineInstr *MI, int) {
+  auto IsExpiredFn = [](MachineInstr *MI, int) {
     return MI && (SIInstrInfo::isVALU(*MI) ||
                   (MI->getOpcode() == AMDGPU::S_WAITCNT &&
-                   !MI->getOperand(0).getImm()));
+                   !MI->getOperand(0).getImm()) ||
+                  (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+                   MI->getOperand(0).getImm() == 0xffe3));
   };
 
   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -941,7 +943,9 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
     return false;
 
   const SIInstrInfo *TII = ST.getInstrInfo();
-  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(0xffe3);
   return true;
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 2a3034763087..7b375641f729 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1075,7 +1075,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10_W32-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10_W32-NEXT:  BB13_2: ; %exit
-; GFX10_W32-NEXT:    v_nop
+; GFX10_W32-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
@@ -1113,7 +1113,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10_W64-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10_W64-NEXT:  BB13_2: ; %exit
-; GFX10_W64-NEXT:    v_nop
+; GFX10_W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10_W64-NEXT:    s_and_b32 s0, 1, s6
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 28c7d47e855f..e38df28d23d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -18,7 +18,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:  BB0_2: ; %bb
-; GCN-NEXT:    v_nop
+; GCN-NEXT:    s_waitcnt_depctr 0xffe3
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 284da9da36ee..a13320bea7a1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -124,7 +124,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB0_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
@@ -156,7 +156,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB0_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
@@ -298,7 +298,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB1_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
@@ -334,7 +334,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB1_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
@@ -520,7 +520,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB2_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -572,7 +572,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB2_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -759,7 +759,7 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB3_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -811,7 +811,7 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB3_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -998,7 +998,7 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB4_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -1050,7 +1050,7 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB4_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -1194,7 +1194,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB5_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
@@ -1228,7 +1228,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB5_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
@@ -1406,7 +1406,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB6_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
@@ -1449,7 +1449,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB6_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
@@ -1675,7 +1675,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB8_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
@@ -1708,7 +1708,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB8_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
@@ -1851,7 +1851,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB9_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mul_lo_u32 v0, s0, v0
@@ -1887,7 +1887,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB9_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
@@ -2073,7 +2073,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB10_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -2125,7 +2125,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB10_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -2271,7 +2271,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB11_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
@@ -2307,7 +2307,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB11_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
@@ -2487,7 +2487,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB12_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
@@ -2530,7 +2530,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB12_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
@@ -2808,7 +2808,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB14_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -2859,7 +2859,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB14_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -3046,7 +3046,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB15_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -3098,7 +3098,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB15_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -3285,7 +3285,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB16_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB16_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -3521,7 +3521,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB17_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -3572,7 +3572,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB17_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -3719,7 +3719,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB18_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
@@ -3754,7 +3754,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB18_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
@@ -3941,7 +3941,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB19_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -3992,7 +3992,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB19_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -4139,7 +4139,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB20_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
@@ -4174,7 +4174,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB20_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
@@ -4364,7 +4364,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB21_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -4416,7 +4416,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB21_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB22_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
@@ -4595,7 +4595,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB22_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
@@ -4782,7 +4782,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB23_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, v1
@@ -4833,7 +4833,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB23_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, v1
@@ -4977,7 +4977,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:  BB24_2:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
@@ -5012,7 +5012,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:  BB24_2:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index a4094573f8ab..b9ad02a77bdf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -130,7 +130,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
 ; GFX1064-NEXT:    v_mul_u32_u24_e64 v1, s12, 5
 ; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX1064-NEXT:  BB0_3:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[30:31]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
@@ -164,7 +164,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
 ; GFX1032-NEXT:    v_mul_u32_u24_e64 v1, s10, 5
 ; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX1032-NEXT:  BB0_3:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
@@ -364,7 +364,7 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
 ; GFX1064-NEXT:  BB1_3:
-; GFX1064-NEXT:    v_nop
+; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[30:31]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
@@ -418,7 +418,7 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
 ; GFX1032-NEXT:  BB1_3:
-; GFX1032-NEXT:    v_nop
+; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index dcd3b63f10ce..a3727cee1570 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -386,7 +386,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
 ; GFX1010-NEXT:    ; implicit-def: $vcc_hi
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX1010-NEXT:    v_nop
+; GFX1010-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
 ; GFX1010-NEXT:    ;;#ASMSTART
 ; GFX1010-NEXT:    ;;#ASMEND

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index 384cb1b4699d..85859fb61eb4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -29,7 +29,7 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
 ; GCN-NEXT:    s_and_saveexec_b32 s4, s4
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    buffer_store_dword v0, v0, s[8:11], 0 offen
-; GCN-NEXT:    v_nop
+; GCN-NEXT:    s_waitcnt_depctr 0xffe3
 ; GCN-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
 ; GCN-NEXT:    s_cbranch_execnz BB0_2
 ; GCN-NEXT:  ; %bb.3: ; in Loop: Header=BB0_1 Depth=1

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 4dd9efa9c008..432b016a2b59 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -59,7 +59,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: ;;#ASMEND
 
 ; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1
-; GFX10-NEXT: v_nop
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4

diff  --git a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
index 6ae620b8ad24..165ebcc6d135 100644
--- a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
@@ -2,7 +2,7 @@
 
 # GCN-LABEL: name: vmem_write_sgpr
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_write_sgpr
@@ -16,7 +16,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_write_exec
 # GCN:      BUFFER_STORE_DWORD_OFFEN_exact
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_write_exec
@@ -35,7 +35,7 @@ body:             |
 # GCN-NEXT: S_MOV_B32
 # GCN-NEXT: S_MOV_B32
 # GCN-NEXT: S_MOV_B32
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_write_sgpr_chain
@@ -54,7 +54,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_smem_write_sgpr
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_LOAD_DWORD_IMM
 ---
 name:            vmem_smem_write_sgpr
@@ -69,7 +69,7 @@ body:             |
 # GCN-LABEL: name: vmem_snop_write_sgpr
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
 # GCN-NEXT: S_NOP
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_snop_write_sgpr
@@ -115,7 +115,7 @@ body:             |
 # GCN-LABEL: name: vmem_swait_any_write_sgpr
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
 # GCN-NEXT: S_WAITCNT
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_swait_any_write_sgpr
@@ -130,7 +130,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_write_exec_impread
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
-# GCN:      V_NOP
+# GCN:      S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B64
 ---
 name:            vmem_write_exec_impread
@@ -144,7 +144,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_write_exec_expread
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B64
 ---
 name:            vmem_write_exec_expread
@@ -157,7 +157,7 @@ body:             |
 ...
 # GCN-LABEL: name: ds_write_m0
 # GCN:      DS_READ_B32
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            ds_write_m0
@@ -171,7 +171,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_write_sgpr_fall_through
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
-# GCN:      V_NOP
+# GCN:      S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_write_sgpr_fall_through
@@ -189,7 +189,7 @@ body:             |
 # GCN-LABEL: name: vmem_write_sgpr_branch
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
 # GCN-NEXT: S_BRANCH
-# GCN:      V_NOP
+# GCN:      S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_write_sgpr_branch
@@ -209,7 +209,7 @@ body:             |
 # GCN:      BUFFER_LOAD_DWORD_OFFEN
 # GCN-NEXT: S_BRANCH
 # GCN:      bb.2:
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_write_sgpr_branch_around
@@ -237,7 +237,7 @@ body:             |
 # GCN:      S_WAITCNT
 # GCN:      V_ADD_I32
 # GCN:      bb.2:
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_write_sgpr_cbranch_around
@@ -262,7 +262,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_write_sgpr_branch_backedge
 # GCN:      $vgpr0 = IMPLICIT_DEF
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_write_sgpr_branch_backedge
@@ -280,7 +280,7 @@ body:             |
 ...
 # GCN-LABEL: name: ds_write_exec
 # GCN:      DS_WRITE_B32_gfx9
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            ds_write_exec
@@ -293,7 +293,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_scratch_exec
 # GCN:      SCRATCH_LOAD_DWORD
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_scratch_exec
@@ -305,7 +305,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_flat_exec
 # GCN:      FLAT_LOAD_DWORD
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_flat_exec
@@ -318,7 +318,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_global_exec
 # GCN:      GLOBAL_LOAD_DWORD
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_global_exec
@@ -331,7 +331,7 @@ body:             |
 ...
 # GCN-LABEL: name: vmem_global_atomic_exec
 # GCN: GLOBAL_ATOMIC_ADD_RTN
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
 # GCN-NEXT: S_MOV_B32
 ---
 name:            vmem_global_atomic_exec

diff  --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 55557e51b82c..388a75d148bd 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1059,7 +1059,7 @@ declare void @external_void_func_void() #1
 ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: v_nop
+; GCN-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
 
@@ -1082,7 +1082,7 @@ declare void @external_void_func_void() #1
 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}}
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: v_nop
+; GCN-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)


        


More information about the llvm-commits mailing list