[llvm] 5bf2a9d - [AMDGPU] Update VMEM scalar write hazard mitigation sequence
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 15 19:38:56 PDT 2020
Author: Carl Ritson
Date: 2020-07-16T11:37:45+09:00
New Revision: 5bf2a9dd40dbba6dacbbe61f843d4d3a6f54f294
URL: https://github.com/llvm/llvm-project/commit/5bf2a9dd40dbba6dacbbe61f843d4d3a6f54f294
DIFF: https://github.com/llvm/llvm-project/commit/5bf2a9dd40dbba6dacbbe61f843d4d3a6f54f294.diff
LOG: [AMDGPU] Update VMEM scalar write hazard mitigation sequence
Using s_waitcnt_depctr 0xffe3 is potentially faster than v_nop.
Reviewed By: rampitec, foad
Differential Revision: https://reviews.llvm.org/D83872
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
llvm/test/CodeGen/AMDGPU/cc-update.ll
llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
llvm/test/CodeGen/AMDGPU/wave32.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8482dbfec250..222923187081 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -930,10 +930,12 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
return false;
};
- auto IsExpiredFn = [] (MachineInstr *MI, int) {
+ auto IsExpiredFn = [](MachineInstr *MI, int) {
return MI && (SIInstrInfo::isVALU(*MI) ||
(MI->getOpcode() == AMDGPU::S_WAITCNT &&
- !MI->getOperand(0).getImm()));
+ !MI->getOperand(0).getImm()) ||
+ (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ MI->getOperand(0).getImm() == 0xffe3));
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -941,7 +943,9 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
return false;
const SIInstrInfo *TII = ST.getInstrInfo();
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(0xffe3);
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 2a3034763087..7b375641f729 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1075,7 +1075,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0
; GFX10_W32-NEXT: s_cselect_b32 s4, 1, 0
; GFX10_W32-NEXT: BB13_2: ; %exit
-; GFX10_W32-NEXT: v_nop
+; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10_W32-NEXT: s_and_b32 s0, 1, s4
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
@@ -1113,7 +1113,7 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0
; GFX10_W64-NEXT: BB13_2: ; %exit
-; GFX10_W64-NEXT: v_nop
+; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10_W64-NEXT: s_and_b32 s0, 1, s6
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 28c7d47e855f..e38df28d23d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -18,7 +18,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: BB0_2: ; %bb
-; GCN-NEXT: v_nop
+; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 284da9da36ee..a13320bea7a1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -124,7 +124,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB0_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
@@ -156,7 +156,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB0_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
@@ -298,7 +298,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB1_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0
@@ -334,7 +334,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB1_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0
@@ -520,7 +520,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB2_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -572,7 +572,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB2_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -759,7 +759,7 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB3_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -811,7 +811,7 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB3_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -998,7 +998,7 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB4_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -1050,7 +1050,7 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB4_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -1194,7 +1194,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB5_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_readfirstlane_b32 s3, v2
@@ -1228,7 +1228,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB5_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: v_readfirstlane_b32 s3, v2
@@ -1406,7 +1406,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB6_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0
@@ -1449,7 +1449,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB6_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0
@@ -1675,7 +1675,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB8_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
@@ -1708,7 +1708,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB8_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
@@ -1851,7 +1851,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB9_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0
@@ -1887,7 +1887,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB9_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0
@@ -2073,7 +2073,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB10_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -2125,7 +2125,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB10_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -2271,7 +2271,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB11_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0
@@ -2307,7 +2307,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB11_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0
@@ -2487,7 +2487,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB12_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0
@@ -2530,7 +2530,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB12_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0
@@ -2808,7 +2808,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB14_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -2859,7 +2859,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB14_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -3046,7 +3046,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB15_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -3098,7 +3098,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB15_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -3285,7 +3285,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB16_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB16_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -3521,7 +3521,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB17_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -3572,7 +3572,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB17_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -3719,7 +3719,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB18_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
@@ -3754,7 +3754,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB18_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
@@ -3941,7 +3941,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB19_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -3992,7 +3992,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB19_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -4139,7 +4139,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB20_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
@@ -4174,7 +4174,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB20_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
@@ -4364,7 +4364,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB21_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -4416,7 +4416,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB21_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB22_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
@@ -4595,7 +4595,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB22_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
@@ -4782,7 +4782,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB23_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
@@ -4833,7 +4833,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB23_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
@@ -4977,7 +4977,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB24_2:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
@@ -5012,7 +5012,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB24_2:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index a4094573f8ab..b9ad02a77bdf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -130,7 +130,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s12, 5
; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1064-NEXT: BB0_3:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
@@ -164,7 +164,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s10, 5
; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT: BB0_3:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
@@ -364,7 +364,7 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
; GFX1064-NEXT: v_mov_b32_e32 v0, s12
; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1064-NEXT: BB1_3:
-; GFX1064-NEXT: v_nop
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
@@ -418,7 +418,7 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
; GFX1032-NEXT: v_mov_b32_e32 v0, s10
; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1032-NEXT: BB1_3:
-; GFX1032-NEXT: v_nop
+; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index dcd3b63f10ce..a3727cee1570 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -386,7 +386,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX1010-NEXT: ; implicit-def: $vcc_hi
; GFX1010-NEXT: s_waitcnt vmcnt(0)
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
-; GFX1010-NEXT: v_nop
+; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-NEXT: s_mov_b32 s6, 0x20000
; GFX1010-NEXT: ;;#ASMSTART
; GFX1010-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index 384cb1b4699d..85859fb61eb4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -29,7 +29,7 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
; GCN-NEXT: s_and_saveexec_b32 s4, s4
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen
-; GCN-NEXT: v_nop
+; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GCN-NEXT: s_cbranch_execnz BB0_2
; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 4dd9efa9c008..432b016a2b59 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -59,7 +59,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: ;;#ASMEND
; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1
-; GFX10-NEXT: v_nop
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
diff --git a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
index 6ae620b8ad24..165ebcc6d135 100644
--- a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir
@@ -2,7 +2,7 @@
# GCN-LABEL: name: vmem_write_sgpr
# GCN: BUFFER_LOAD_DWORD_OFFEN
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_write_sgpr
@@ -16,7 +16,7 @@ body: |
...
# GCN-LABEL: name: vmem_write_exec
# GCN: BUFFER_STORE_DWORD_OFFEN_exact
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_write_exec
@@ -35,7 +35,7 @@ body: |
# GCN-NEXT: S_MOV_B32
# GCN-NEXT: S_MOV_B32
# GCN-NEXT: S_MOV_B32
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_write_sgpr_chain
@@ -54,7 +54,7 @@ body: |
...
# GCN-LABEL: name: vmem_smem_write_sgpr
# GCN: BUFFER_LOAD_DWORD_OFFEN
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_LOAD_DWORD_IMM
---
name: vmem_smem_write_sgpr
@@ -69,7 +69,7 @@ body: |
# GCN-LABEL: name: vmem_snop_write_sgpr
# GCN: BUFFER_LOAD_DWORD_OFFEN
# GCN-NEXT: S_NOP
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_snop_write_sgpr
@@ -115,7 +115,7 @@ body: |
# GCN-LABEL: name: vmem_swait_any_write_sgpr
# GCN: BUFFER_LOAD_DWORD_OFFEN
# GCN-NEXT: S_WAITCNT
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_swait_any_write_sgpr
@@ -130,7 +130,7 @@ body: |
...
# GCN-LABEL: name: vmem_write_exec_impread
# GCN: BUFFER_LOAD_DWORD_OFFEN
-# GCN: V_NOP
+# GCN: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B64
---
name: vmem_write_exec_impread
@@ -144,7 +144,7 @@ body: |
...
# GCN-LABEL: name: vmem_write_exec_expread
# GCN: BUFFER_LOAD_DWORD_OFFEN
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B64
---
name: vmem_write_exec_expread
@@ -157,7 +157,7 @@ body: |
...
# GCN-LABEL: name: ds_write_m0
# GCN: DS_READ_B32
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: ds_write_m0
@@ -171,7 +171,7 @@ body: |
...
# GCN-LABEL: name: vmem_write_sgpr_fall_through
# GCN: BUFFER_LOAD_DWORD_OFFEN
-# GCN: V_NOP
+# GCN: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_write_sgpr_fall_through
@@ -189,7 +189,7 @@ body: |
# GCN-LABEL: name: vmem_write_sgpr_branch
# GCN: BUFFER_LOAD_DWORD_OFFEN
# GCN-NEXT: S_BRANCH
-# GCN: V_NOP
+# GCN: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_write_sgpr_branch
@@ -209,7 +209,7 @@ body: |
# GCN: BUFFER_LOAD_DWORD_OFFEN
# GCN-NEXT: S_BRANCH
# GCN: bb.2:
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_write_sgpr_branch_around
@@ -237,7 +237,7 @@ body: |
# GCN: S_WAITCNT
# GCN: V_ADD_I32
# GCN: bb.2:
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_write_sgpr_cbranch_around
@@ -262,7 +262,7 @@ body: |
...
# GCN-LABEL: name: vmem_write_sgpr_branch_backedge
# GCN: $vgpr0 = IMPLICIT_DEF
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_write_sgpr_branch_backedge
@@ -280,7 +280,7 @@ body: |
...
# GCN-LABEL: name: ds_write_exec
# GCN: DS_WRITE_B32_gfx9
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: ds_write_exec
@@ -293,7 +293,7 @@ body: |
...
# GCN-LABEL: name: vmem_scratch_exec
# GCN: SCRATCH_LOAD_DWORD
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_scratch_exec
@@ -305,7 +305,7 @@ body: |
...
# GCN-LABEL: name: vmem_flat_exec
# GCN: FLAT_LOAD_DWORD
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_flat_exec
@@ -318,7 +318,7 @@ body: |
...
# GCN-LABEL: name: vmem_global_exec
# GCN: GLOBAL_LOAD_DWORD
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_global_exec
@@ -331,7 +331,7 @@ body: |
...
# GCN-LABEL: name: vmem_global_atomic_exec
# GCN: GLOBAL_ATOMIC_ADD_RTN
-# GCN-NEXT: V_NOP
+# GCN-NEXT: S_WAITCNT_DEPCTR 65507
# GCN-NEXT: S_MOV_B32
---
name: vmem_global_atomic_exec
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 55557e51b82c..388a75d148bd 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1059,7 +1059,7 @@ declare void @external_void_func_void() #1
; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: v_nop
+; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
@@ -1082,7 +1082,7 @@ declare void @external_void_func_void() #1
; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: v_nop
+; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-commits
mailing list