[llvm] [AMDGPU] Disable SGPR read hazard mitigation for gfx1250 (PR #150344)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 23 17:30:33 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-globalisel

Author: Changpeng Fang (changpeng)

<details>
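<summary>Changes</summary>

`GCNSubtarget::hasVALUReadSGPRHazard()` now returns `GFX12Insts && !GFX1250Insts` instead of `getGeneration() == GFX12`, so the VALU-read-SGPR hazard mitigation is no longer applied on gfx1250. The gfx1250 test checks are updated to match: in the portion of the diff shown below, the `s_wait_alu` lines are removed and the `s_delay_alu` placement changes.

---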

The patch is 215.62 KiB and is truncated to 20.00 KiB below; the full version is available at: https://github.com/llvm/llvm-project/pull/150344.diff


10 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+14-27) 
- (modified) llvm/test/CodeGen/AMDGPU/bf16-conversions.ll (-2) 
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+18-47) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll (+52-498) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll (+3-6) 
- (modified) llvm/test/CodeGen/AMDGPU/literal64.ll (+2-6) 
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll (+1-11) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b22d421b425be..0435e7f9e51d2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1308,7 +1308,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
 
-  bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
+  bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
 
   /// Return if operations acting on VGPR tuples require even alignment.
   bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 0d571d0e563b5..6cc192c570f8a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1217,11 +1217,9 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX1250-NEXT:    v_mov_b32_e32 v13, v10
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
 ; GFX1250-NEXT:    v_mul_lo_u32 v8, v8, v7
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v8, null, v11, v8, s0
-; GFX1250-NEXT:    s_wait_alu 0xfffd
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v6
@@ -2865,19 +2863,17 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17]
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX1250-NEXT:    s_wait_alu 0xfffd
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[20:21], null, v0, v10, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17]
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -2887,65 +2883,56 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
 ; GFX1250-NEXT:    v_mov_b32_e32 v20, v19
-; GFX1250-NEXT:    s_wait_alu 0xfffd
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s0
 ; GFX1250-NEXT:    v_mov_b32_e32 v21, v22
 ; GFX1250-NEXT:    v_mul_lo_u32 v22, v5, v10
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
-; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
 ; GFX1250-NEXT:    v_mul_lo_u32 v25, v4, v11
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v28, 0, 1, s2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
 ; GFX1250-NEXT:    v_mul_lo_u32 v20, v2, v13
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v21, null, 0, v28, s2
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v21, s2
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
 ; GFX1250-NEXT:    v_mul_lo_u32 v0, v0, v15
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
 ; GFX1250-NEXT:    v_mul_lo_u32 v9, v1, v14
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, s2
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
 ; GFX1250-NEXT:    v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v4, s2, v27, v13, s2
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v6, s2, v6, v11, s2
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v23, v0, s2
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v9, s5
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v20, s4
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v29, s3
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v25, s1
-; GFX1250-NEXT:    s_wait_alu 0xfffd
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_co_ci_u32_e64 v0, null, v0, v26, s0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 6823a472c3ac6..752a87ac3cb73 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -218,7 +218,6 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v2, -1, 1, s1
 ; GFX1250-NEXT:    v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[6:7]|
 ; GFX1250-NEXT:    v_dual_add_nc_u32 v1, v8, v2 :: v_dual_bitop2_b32 v10, 1, v8 bitop3:0x40
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s1
 ; GFX1250-NEXT:    v_and_b32_e32 v11, 1, v9
@@ -229,7 +228,6 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ; GFX1250-NEXT:    s_or_b32 vcc_lo, s1, vcc_lo
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc_lo
 ; GFX1250-NEXT:    s_or_b32 vcc_lo, s2, s0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 7fec5f71ce8d5..2ad7818bd3ca8 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -168,7 +168,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 ; GCN-NEXT:    s_sleep 0
 ; GCN-NEXT:    s_sleep 0
 ; GCN-NEXT:  .LBB3_2: ; %bb3
-; GCN-NEXT:    s_wait_alu 0xfffe
 ; GCN-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off scope:SCOPE_SYS
 ; GCN-NEXT:    s_wait_storecnt 0x0
@@ -589,7 +588,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GCN-NEXT:    s_wait_alu 0xfffe
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
 ; GCN-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 4a634520c682e..b71885b54b5a2 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -822,10 +822,9 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -1803,10 +1802,9 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_sub_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX1250-NEXT:    s_wait_alu 0xf1ff
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -3136,26 +3134,22 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3)
 ; GFX1250-NEXT:    s_fmac_f32 s0, s1, 0x4f800000
 ; GFX1250-NEXT:    v_s_rcp_f32 s0, s0
-; GFX1250-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
 ; GFX1250-NEXT:    s_mul_f32 s0, s0, 0x5f7ffffc
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_mul_f32 s1, s0, 0x2f800000
-; GFX1250-NEXT:    s_wait_alu 0xfffe
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
 ; GFX1250-NEXT:    s_trunc_f32 s1, s1
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_fmac_f32 s0, s1, 0xcf800000
 ; GFX1250-NEXT:    s_cvt_u32_f32 s5, s1
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
 ; GFX1250-NEXT:    s_cvt_u32_f32 s4, s0
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_u64 s[12:13], s[6:7], s[4:5]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_hi_u32 s15, s4, s13
 ; GFX1250-NEXT:    s_mul_i32 s14, s4, s13
 ; GFX1250-NEXT:    s_mul_hi_u32 s0, s4, s12
 ; GFX1250-NEXT:    s_mul_i32 s17, s5, s12
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_add_nc_u64 s[14:15], s[0:1], s[14:15]
 ; GFX1250-NEXT:    s_mul_hi_u32 s16, s5, s12
 ; GFX1250-NEXT:    s_mul_hi_u32 s18, s5, s13
@@ -3163,99 +3157,82 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:    s_add_co_ci_u32 s0, s15, s16
 ; GFX1250-NEXT:    s_mul_i32 s12, s5, s13
 ; GFX1250-NEXT:    s_add_co_ci_u32 s13, s18, 0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_add_nc_u64 s[12:13], s[0:1], s[12:13]
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_add_co_u32 v0, s0, s4, s12
 ; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1250-NEXT:    s_add_co_ci_u32 s5, s5, s13
 ; GFX1250-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_mul_u64 s[6:7], s[6:7], s[4:5]
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_hi_u32 s13, s4, s7
 ; GFX1250-NEXT:    s_mul_i32 s12, s4, s7
 ; GFX1250-NEXT:    s_mul_hi_u32 s0, s4, s6
 ; GFX1250-NEXT:    s_mul_i32 s15, s5, s6
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_add_nc_u64 s[12:13], s[0:1], s[12:13]
 ; GFX1250-NEXT:    s_mul_hi_u32 s14, s5, s6
 ; GFX1250-NEXT:    s_mul_hi_u32 s4, s5, s7
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_add_co_u32 s0, s12, s15
 ; GFX1250-NEXT:    s_add_co_ci_u32 s0, s13, s14
 ; GFX1250-NEXT:    s_mul_i32 s6, s5, s7
 ; GFX1250-NEXT:    s_add_co_ci_u32 s7, s4, 0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_add_nc_u64 s[6:7], s[0:1], s[6:7]
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_add_co_u32 v0, s0, v0, s6
 ; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1250-NEXT:    s_add_co_ci_u32 s0, s5, s7
 ; GFX1250-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_mul_hi_u32 s5, s10, s0
 ; GFX1250-NEXT:    s_mul_i32 s4, s10, s0
 ; GFX1250-NEXT:    s_mul_hi_u32 s12, s11, s0
 ; GFX1250-NEXT:    s_mul_i32 s6, s11, s0
 ; GFX1250-NEXT:    s_mul_hi_u32 s0, s10, s7
 ; GFX1250-NEXT:    s_mul_i32 s13, s11, s7
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_add_nc_u64 s[4:5], s[0:1], s[4:5]
 ; GFX1250-NEXT:    s_mul_hi_u32 s0, s11, s7
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_add_co_u32 s4, s4, s13
 ; GFX1250-NEXT:    s_add_co_ci_u32 s0, s5, s0
 ; GFX1250-NEXT:    s_add_co_ci_u32 s7, s12, 0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_add_nc_u64 s[4:5], s[0:1], s[6:7]
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000)
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_or_b32 s6, s6, s4
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_mul_u64 s[4:5], s[2:3], s[6:7]
 ; GFX1250-NEXT:    s_add_nc_u64 s[14:15], s[6:7], 2
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    v_sub_co_u32 v0, s0, s10, s4
 ; GFX1250-NEXT:    s_sub_co_i32 s4, s11, s5
 ; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
 ; GFX1250-NEXT:    v_sub_co_u32 v1, s12, v0, s2
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_sub_co_ci_u32 s4, s4, s3
 ; GFX1250-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX1250-NEXT:    s_add_nc_u64 s[12:13], s[6:7], 1
 ; GFX1250-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v1
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_sub_co_ci_u32 s4, s4, 0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_cmp_ge_u32 s4, s3
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX1250-NEXT:    s_cselect_b32 s14, -1, 0
 ; GFX1250-NEXT:    s_cmp_eq_u32 s4, s3
 ; GFX1250-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1250-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v1, s14, v1, vcc_lo
 ; GFX1250-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v0
 ; GFX1250-NEXT:    s_sub_co_ci_u32 s0, s11, s5
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_cmp_ge_u32 s0, s3
-; GFX1250-NEXT:    s_wait_alu 0xfffd
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX1250-NEXT:    s_cmp_eq_u32 s0, s3
 ; GFX1250-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX1250-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s4, v0, s0
-; GFX1250-NEXT:    s_wait_alu 0xfffd
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v2, s12, v2, vcc_lo
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v1, s13, v3, vcc_lo
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1250-NEXT:    s_wait_alu 0xfffd
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v1, s7, v1, vcc_lo
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, s6, v2, vcc_lo
 ; GFX1250-NEXT:    s_cbranch_execnz .LBB16_3
@@ -3269,31 +3246,25 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1250-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_mul_i32 s1, s1, s0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_hi_u32 s1, s0, s1
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_add_co_i32 s0, s0, s1
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_hi_u32 s0, s10, s0
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_mul_i32 s1, s0, s2
 ; GFX1250-NEXT:    s_add_co_i32 s3, s0, 1
-; GFX1250-NEXT:    s_wait_alu 0xfffe
 ; GFX1250-NEXT:    s_sub_co_i32 s1, s10, s1
-; GFX1250-NEXT:    s_wait_alu 0xfffe
+; GFX1250-NEXT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/150344


More information about the llvm-commits mailing list