[llvm] [AMDGPU] Avoid unneded waitcounts before spill stores (PR #108303)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 11 15:26:20 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
Implicit defs and uses on spill stores were accounted as real defs and uses, while only exist for liveness accounting. As a result unneded waits were generated.
Fixes: SWDEV-484177
---
Patch is 399.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/108303.diff
19 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+1-7)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+187-162)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+17-30)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+18-15)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+11-3)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+35-13)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+37-15)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+133-118)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (-334)
- (added) llvm/test/CodeGen/AMDGPU/spill-wait.mir (+25)
- (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8ae7f2910ec5a9..ced1d0c508ebc8 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -901,7 +901,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
// Match the score to the destination registers.
- for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ for (unsigned I = 0, E = Inst.getNumExplicitOperands(); I != E; ++I) {
auto &Op = Inst.getOperand(I);
if (!Op.isReg() || !Op.isDef())
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 1f1c2659e81103..fff06e4ad16c07 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -268,7 +268,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
@@ -294,7 +294,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
@@ -317,7 +317,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index fb96b9ff2952e8..49b450a9af0bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -185,7 +185,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -215,7 +215,6 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT: s_waitcnt vmcnt(9)
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v39 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
@@ -1093,7 +1092,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -1124,7 +1123,6 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT: s_waitcnt vmcnt(10)
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 75f5eda608e80a..51f9cf73488ee7 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -708,7 +708,6 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GCN-O0-NEXT: v_mov_b32_e32 v6, v2
; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
@@ -722,6 +721,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GCN-O0-NEXT: s_waitcnt vmcnt(4)
; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -1159,7 +1159,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: v_mov_b32_e32 v3, s10
; GCN-O0-NEXT: v_mov_b32_e32 v4, s11
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -1193,7 +1192,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: v_mov_b32_e32 v3, s10
; GCN-O0-NEXT: v_mov_b32_e32 v4, s11
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -1225,7 +1223,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -1247,7 +1244,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -1269,7 +1265,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -1343,7 +1338,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index fea1303d0a2b76..712cecff40617b 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -345,30 +345,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b32 s4, 63
; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[13:14]
; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19
; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5
; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17
; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v19
@@ -483,14 +483,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7]
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
@@ -538,10 +538,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec
; GFX9-O0-NEXT: v_writelane_b32 v0, s4, 4
@@ -572,16 +572,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
@@ -598,10 +601,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_9
; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
@@ -638,10 +641,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_3
; GFX9-O0-NEXT: .LBB0_5: ; %Flow1
@@ -662,16 +665,16 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
@@ -756,11 +759,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v23
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
@@ -846,22 +850,22 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2
; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0
; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14
; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/108303
More information about the llvm-commits
mailing list