[llvm] [AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU (PR #127212)
Ana Mihajlovic via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 21 07:50:13 PST 2025
https://github.com/mihajlovicana updated https://github.com/llvm/llvm-project/pull/127212
From 10873d7d83fcb34bde3fc44e06f347fe66b0d305 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Sat, 15 Feb 2025 02:46:58 +0100
Subject: [PATCH 1/7] [AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU
---
.../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 5 +-
.../GlobalISel/llvm.amdgcn.intersect_ray.ll | 8 +--
.../AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll | 1 -
.../AMDGPU/atomic_optimizations_buffer.ll | 12 ----
.../atomic_optimizations_global_pointer.ll | 20 ++----
.../atomic_optimizations_local_pointer.ll | 70 ++++++-------------
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 8 ---
.../atomic_optimizations_struct_buffer.ll | 8 ---
llvm/test/CodeGen/AMDGPU/bf16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 1 -
.../buffer-fat-pointer-atomicrmw-fadd.ll | 32 +++------
.../buffer-fat-pointer-atomicrmw-fmax.ll | 34 +++------
.../buffer-fat-pointer-atomicrmw-fmin.ll | 34 +++------
.../test/CodeGen/AMDGPU/carryout-selection.ll | 13 ++--
.../CodeGen/AMDGPU/combine-add-zext-xor.ll | 16 ++---
.../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 39 +----------
.../expand-scalar-carry-out-select-user.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 7 +-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 16 ++---
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 17 ++---
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 27 ++++---
llvm/test/CodeGen/AMDGPU/fp-classify.ll | 8 +--
llvm/test/CodeGen/AMDGPU/fptrunc.ll | 7 +-
llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 16 ++---
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 12 ++--
llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir | 3 +
.../insert_waitcnt_for_precise_memory.ll | 10 ++-
...e92561-restore-undef-scc-verifier-error.ll | 12 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll | 4 +-
...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 1 -
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 12 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 12 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll | 1 -
.../AMDGPU/llvm.amdgcn.s.ttracedata.ll | 1 -
....amdgcn.struct.buffer.load.format.v3f16.ll | 4 +-
...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 2 -
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 2 -
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 2 -
...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 7 +-
...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 7 +-
...gcn.struct.ptr.buffer.load.format.v3f16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 12 ++--
llvm/test/CodeGen/AMDGPU/min.ll | 4 --
...uf-legalize-operands-non-ptr-intrinsics.ll | 20 +++---
.../CodeGen/AMDGPU/mubuf-legalize-operands.ll | 20 +++---
.../CodeGen/AMDGPU/no-dup-inst-prefetch.ll | 3 +-
.../AMDGPU/pseudo-scalar-transcendental.ll | 2 +-
llvm/test/CodeGen/AMDGPU/saddo.ll | 9 ++-
llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 2 -
llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll | 5 +-
llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 6 +-
...r-descriptor-waterfall-loop-idom-update.ll | 2 +-
54 files changed, 200 insertions(+), 398 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 3f2bb5df8836b..7eb608fc93e63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -371,7 +371,10 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
auto It = State.find(Unit);
if (It != State.end()) {
- Delay.merge(It->second);
+ if (!(SII->isSALU(MI.getOpcode())) ||
+ !AMDGPU::isSGPR(Op.getReg(), TRI) ||
+ It->second.VALUCycles == 0)
+ Delay.merge(It->second);
State.erase(Unit);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index b26ddbdd7a342..12ebf3f6879d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -234,8 +234,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7]
; GFX11-NEXT: ; implicit-def: $vgpr18
@@ -360,8 +360,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16
; GFX11-NEXT: ; implicit-def: $vgpr18
@@ -476,8 +476,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -604,8 +604,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16
; GFX11-NEXT: ; implicit-def: $vgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
index af50f56a87226..7d084582273d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
@@ -1468,7 +1468,6 @@ define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; encoding: [0x00,0x05,0x00,0x7e]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0 ; encoding: [0x01,0x10,0x00,0xb9]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index cd405fabf002d..4b68f8a4bd194 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -777,7 +777,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -822,7 +821,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -864,7 +862,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -910,7 +907,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -1178,7 +1174,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1226,7 +1221,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1270,7 +1264,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1319,7 +1312,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -2246,7 +2238,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2291,7 +2282,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2334,7 +2324,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2380,7 +2369,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 8bb8ecb079a34..2bcce6c04c0bb 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -899,7 +899,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -950,7 +949,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -999,7 +997,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -1049,7 +1046,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
@@ -2576,17 +2572,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2639,7 +2634,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
@@ -4454,7 +4448,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
@@ -4505,7 +4498,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
@@ -4554,7 +4546,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
@@ -4604,7 +4595,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
@@ -6164,17 +6154,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6227,7 +6216,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 3c0646c46efd0..eb5353e928682 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -669,7 +669,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -715,7 +714,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -1215,7 +1213,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -1248,7 +1246,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
@@ -2217,17 +2215,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2275,7 +2272,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
@@ -3019,11 +3015,10 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
@@ -3059,7 +3054,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
@@ -4091,7 +4085,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
@@ -4137,7 +4130,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
@@ -4637,7 +4629,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -4670,7 +4662,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
@@ -5662,17 +5654,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5720,7 +5711,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
@@ -6508,7 +6498,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
@@ -6554,7 +6543,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
@@ -7873,7 +7861,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
@@ -7919,7 +7906,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
@@ -9237,7 +9223,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
@@ -9283,7 +9268,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
@@ -10601,7 +10585,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
@@ -10647,7 +10630,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
@@ -11516,13 +11498,12 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -11574,13 +11555,12 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -12435,7 +12415,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
@@ -12481,7 +12460,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
@@ -13350,13 +13328,12 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -13408,13 +13385,12 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -14269,7 +14245,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
@@ -14315,7 +14290,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
@@ -15173,14 +15147,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -15230,14 +15204,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7]
; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -16088,7 +16062,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
@@ -16134,7 +16107,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
@@ -16993,14 +16965,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -17050,14 +17022,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7]
; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 8c6224cc86284..0a06fe4ea949e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -776,7 +776,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -821,7 +820,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -863,7 +861,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -909,7 +906,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -1834,7 +1830,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
@@ -1879,7 +1874,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
@@ -1922,7 +1916,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
@@ -1968,7 +1961,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 63b46eba41225..bc0bec4772e52 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -797,7 +797,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -843,7 +842,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -885,7 +883,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -932,7 +929,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -2006,7 +2002,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2052,7 +2047,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2095,7 +2089,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2142,7 +2135,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 0382cc72a36ae..7a8e44d640523 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2358,17 +2358,17 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index a03ad4daab014..309233098e175 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -264,7 +264,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB2_1
; GFX11-NEXT: ; %bb.3: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index ba2694fca99fa..af67012718fd7 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -386,7 +386,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -444,8 +443,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
@@ -2374,7 +2373,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -2406,7 +2404,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -2474,10 +2471,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -2505,8 +2501,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
@@ -4120,7 +4116,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4156,7 +4151,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4267,8 +4261,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -4300,8 +4294,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
@@ -5564,7 +5558,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -5611,7 +5604,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -5730,8 +5722,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -5774,8 +5766,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -6739,7 +6731,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6798,8 +6789,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -6827,8 +6818,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
@@ -9058,7 +9049,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -9167,8 +9157,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -9215,8 +9205,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 38adf60888eca..96b9964e39dc4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -378,7 +378,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -469,8 +468,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_max_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
@@ -1587,7 +1586,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -1621,7 +1619,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -1689,10 +1686,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -1722,8 +1718,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
@@ -3221,7 +3217,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -3260,7 +3255,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -3373,8 +3367,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -3409,8 +3403,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -4685,7 +4679,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4732,7 +4725,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4851,8 +4843,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -4895,8 +4887,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -6014,7 +6006,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6048,7 +6039,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6146,8 +6136,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -6177,8 +6167,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
@@ -7467,7 +7457,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -7519,7 +7508,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -7634,8 +7622,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -7682,8 +7670,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 2b8cea9068d87..1cf4fa6da1627 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -378,7 +378,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -469,8 +468,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_min_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
@@ -1587,7 +1586,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -1621,7 +1619,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -1689,10 +1686,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -1722,8 +1718,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
@@ -3221,7 +3217,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -3260,7 +3255,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -3373,8 +3367,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -3409,8 +3403,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -4685,7 +4679,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4732,7 +4725,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4851,8 +4843,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -4895,8 +4887,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -6014,7 +6006,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6048,7 +6039,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6146,8 +6136,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -6177,8 +6167,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
@@ -7467,7 +7457,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -7519,7 +7508,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -7634,8 +7622,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -7682,8 +7670,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index cdea4fd158b04..03c63063101bb 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2759,7 +2759,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_mul_i32 s7, s5, s0
; GFX11-NEXT: s_mul_hi_u32 s13, s5, s1
; GFX11-NEXT: s_mul_i32 s12, s6, s1
@@ -2781,12 +2780,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s1, s1, s7
; GFX11-NEXT: s_addc_u32 s7, 0, s12
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s7, s5, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1
; GFX11-NEXT: s_mul_i32 s6, s6, s1
; GFX11-NEXT: s_add_i32 s7, s12, s7
@@ -2807,9 +2805,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s1, s1, s7
; GFX11-NEXT: s_addc_u32 s5, 0, s5
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s6, s10, s0
; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0
@@ -2881,18 +2879,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_mul_i32 s1, s1, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1
-; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_mul_hi_u32 s0, s10, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s1, s0, s2
; GFX11-NEXT: s_add_i32 s3, s0, 1
; GFX11-NEXT: s_sub_i32 s1, s10, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s4, s1, s2
; GFX11-NEXT: s_cmp_ge_u32 s1, s2
; GFX11-NEXT: s_cselect_b32 s0, s3, s0
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index a25078230385e..8dc781fdb67a7 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -38,12 +38,12 @@ define i32 @combine_add_zext_xor() {
; GFX1100-NEXT: s_branch .LBB0_2
; GFX1100-NEXT: .LBB0_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB0_4
; GFX1100-NEXT: .LBB0_2: ; %.a
@@ -118,12 +118,12 @@ define i32 @combine_sub_zext_xor() {
; GFX1100-NEXT: s_branch .LBB1_2
; GFX1100-NEXT: .LBB1_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB1_4
; GFX1100-NEXT: .LBB1_2: ; %.a
@@ -365,11 +365,11 @@ define i32 @combine_add_zext_and() {
; GFX1100-NEXT: s_branch .LBB4_2
; GFX1100-NEXT: .LBB4_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_add_nc_u32_e32 v1, v1, v0
; GFX1100-NEXT: s_cbranch_vccz .LBB4_4
; GFX1100-NEXT: .LBB4_2: ; %.a
@@ -444,11 +444,11 @@ define i32 @combine_sub_zext_and() {
; GFX1100-NEXT: s_branch .LBB5_2
; GFX1100-NEXT: .LBB5_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_sub_nc_u32_e32 v1, v1, v0
; GFX1100-NEXT: s_cbranch_vccz .LBB5_4
; GFX1100-NEXT: .LBB5_2: ; %.a
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d61c4b46596c0..e0a83c7644b14 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -320,7 +320,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB3_1
@@ -348,7 +347,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -441,7 +439,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
@@ -469,7 +466,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB4_1
@@ -556,7 +552,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB5_1
@@ -584,7 +579,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB5_1
@@ -744,7 +738,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4
; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s2, s2, s5
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2
@@ -761,7 +754,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
@@ -799,7 +791,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_2
@@ -962,7 +953,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB7_2
@@ -1011,7 +1001,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-GISEL-NEXT: s_bitset0_b32 s0, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s4
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_2
@@ -1132,7 +1121,6 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1
@@ -1162,7 +1150,6 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB8_1
@@ -1266,7 +1253,7 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
@@ -1300,7 +1287,7 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100
; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
@@ -1397,7 +1384,6 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1
@@ -1427,7 +1413,6 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB10_1
@@ -1525,7 +1510,6 @@ define void @test_dynamic_stackalloc_device_divergent() {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB11_1
@@ -1557,7 +1541,6 @@ define void @test_dynamic_stackalloc_device_divergent() {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB11_1
@@ -1670,7 +1653,6 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB12_1
@@ -1705,7 +1687,6 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB12_1
@@ -1805,7 +1786,6 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB13_1
@@ -1837,7 +1817,6 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB13_1
@@ -2048,7 +2027,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
@@ -2069,7 +2047,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5
; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_4
@@ -2094,7 +2071,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
@@ -2137,7 +2113,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s5, v2, s4
; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_2
@@ -2155,7 +2130,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s6, v1, s5
; GFX11-GISEL-NEXT: s_bitset0_b32 s4, s5
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s3, s3, s6
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_4
@@ -2179,7 +2153,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_7
@@ -2374,7 +2347,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
@@ -2402,7 +2374,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_6
@@ -2444,7 +2415,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_2
@@ -2473,7 +2443,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_6
@@ -2585,7 +2554,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB16_1
@@ -2617,7 +2585,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB16_1
@@ -2711,7 +2678,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1
@@ -2741,7 +2707,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB17_1
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 598cdddaa53d1..f3aec696abdee 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -66,10 +66,10 @@ define i32 @s_add_co_select_user() {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s1, s0, 0
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
; GFX11-NEXT: s_cselect_b32 s1, s1, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index a457338873157..fe60963534846 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1708,10 +1708,9 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-NEXT: s_lshl_b32 s7, s2, 12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-NEXT: s_or_b32 s3, s5, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
; GFX11-NEXT: s_or_b32 s7, s3, s7
; GFX11-NEXT: s_lshr_b32 s6, s5, s6
@@ -1720,13 +1719,13 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-NEXT: s_or_b32 s5, s6, s5
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cselect_b32 s5, s5, s7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s6, s5, 7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-NEXT: s_cselect_b32 s7, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 0b6bdedeb48fc..5dd96237685dd 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -15303,9 +15303,9 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15465,9 +15465,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15639,9 +15639,9 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15807,9 +15807,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15964,9 +15964,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16120,9 +16120,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -16287,9 +16287,9 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16449,9 +16449,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index a33fd03e0ce03..fd9940c882821 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -329,10 +329,9 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -344,10 +343,9 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -381,7 +379,7 @@ define i32 @test_D139469_f16(half %arg) {
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
@@ -507,10 +505,9 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -530,10 +527,9 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -580,12 +576,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: s_or_b32 s0, s1, s2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b32630a97b3ad..4d40cefb5c987 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -3103,17 +3103,17 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fpround = fptrunc double %a to half
@@ -3251,15 +3251,14 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg double %a
@@ -3406,17 +3405,16 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_or_b32 v3, 0x8000, v5, v2
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg double %a
@@ -3564,15 +3562,14 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3718,16 +3715,16 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fpround = fptrunc double %a to half
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index cc11e256d5544..6a0d52962265d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -316,8 +316,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -368,8 +368,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2
; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -423,8 +423,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2
; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -578,8 +578,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3
; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 051a0c51b0867..647578b57c772 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -434,10 +434,9 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
@@ -445,13 +444,13 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 5fab0c50bbe57..0c5b8b096d910 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -112,11 +112,10 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s2, s2, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
; GFX11-NEXT: s_mov_b64 s[2:3], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
@@ -272,11 +271,10 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s2, s2, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
; GFX11-NEXT: s_mov_b64 s[2:3], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
@@ -430,11 +428,10 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s4, s4, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: s_add_i32 s5, s5, s6
@@ -576,11 +573,10 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s3, s3, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_add_i32 s4, s4, s5
@@ -960,16 +956,15 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v2|, |v0|
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: v_mov_b32_e32 v3, s5
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB6_1
@@ -1097,7 +1092,6 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0|
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s5, s5, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index ea3d57d127151..44b1bb25bc057 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -118,14 +118,14 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 1
-; GFX11-NEXT: s_lshr_b32 s0, s0, s30
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshr_b32 s0, s0, s30
; GFX11-NEXT: s_mul_i32 s0, s0, s22
-; GFX11-NEXT: s_mul_i32 s0, s0, s20
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_i32 s0, s0, s20
; GFX11-NEXT: s_or_b32 s0, s19, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1
; GFX11-NEXT: s_mov_b32 s0, s1
; GFX11-NEXT: global_load_u16 v1, v0, s[20:21]
@@ -145,7 +145,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
; GFX11-NEXT: s_and_b32 s1, s8, s1
; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
; GFX11-NEXT: s_cselect_b32 s1, s19, s13
; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
@@ -155,12 +155,12 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
; GFX11-NEXT: v_readfirstlane_b32 s13, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
; GFX11-NEXT: s_cselect_b32 s13, s19, s13
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_bitcmp1_b32 s13, 0
; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s13, s0
; GFX11-NEXT: s_cbranch_vccz .LBB2_6
; GFX11-NEXT: ; %bb.7: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 5c9c0d1119163..9a2b2bebd16fe 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
@@ -559,3 +560,5 @@ body: |
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 5256cbcef123a..2f37b45651234 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -555,17 +555,16 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s5, s5, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s5
-; GFX11-NEXT: s_add_i32 s4, s4, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s4, s4, s5
; GFX11-NEXT: s_mul_hi_u32 s4, s2, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s5, s4, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s2, s2, s5
; GFX11-NEXT: s_add_i32 s5, s4, 1
; GFX11-NEXT: s_sub_i32 s6, s2, s3
@@ -590,12 +589,11 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_cvt_u32_f32 s4, s4
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_i32 s5, s5, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index f961e857f39e5..2053ae970c773 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -33,14 +33,12 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; SDAG-NEXT: v_cmp_eq_u64_e64 s1, s[8:9], v[4:5]
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: v_cmp_eq_u64_e64 s2, s[10:11], v[6:7]
; SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: s_and_b32 s0, s0, s1
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: s_and_b32 s0, s0, s2
-; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-NEXT: s_and_saveexec_b32 s0, s0
; SDAG-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
@@ -108,14 +106,12 @@ define void @issue92561(ptr addrspace(1) %arg) {
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[6:7]
; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[0:1]
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[2:3]
; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: s_and_b32 s0, s0, s1
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: s_and_b32 s0, s0, s2
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_and_saveexec_b32 s0, s0
; GISEL-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
index 93bc7155cbfa4..ed43612f738c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
@@ -135,7 +135,7 @@ define amdgpu_kernel void @id_row_i32() #0 {
; GFX11-SDAG-LABEL: id_row_i32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
@@ -154,7 +154,7 @@ define amdgpu_kernel void @id_row_i32() #0 {
; GFX12-SDAG-LABEL: id_row_i32:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
index 08d2201036c77..8b6ba1a3cc094 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -83,7 +83,6 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f72f1e52d135f..deeceed3a19be 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -428,7 +428,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -449,7 +448,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -471,7 +469,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -492,7 +489,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -809,7 +805,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_max_u32 s6, s6, s8
@@ -847,7 +843,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8
@@ -884,7 +880,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6
@@ -922,7 +918,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 4551c60770bdf..434e761a5f8a2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -429,7 +429,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -450,7 +449,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -472,7 +470,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -493,7 +490,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -810,7 +806,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_min_u32 s6, s6, s8
@@ -848,7 +844,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, -1
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8
@@ -885,7 +881,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6
@@ -923,7 +919,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
index cf86e2e1dedee..f2ee110c28c6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
@@ -13,7 +13,6 @@ define void @test_s_sleep_var1(i32 %arg) {
; GCN-NEXT: s_wait_bvhcnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_sleep_var s0
; GCN-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.sleep.var(i32 %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
index 5dec1e15cb3d5..b918941d62ffc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
@@ -29,7 +29,6 @@ define amdgpu_cs void @ttracedata_v(i32 %val) {
; GFX11-SDAG-LABEL: ttracedata_v:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
; GFX11-SDAG-NEXT: s_ttracedata
; GFX11-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index fed7a8ec105fd..ea8703df080d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -99,8 +99,8 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -126,8 +126,8 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
index a2b9c869c9c9a..148a5ba75d98b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -52,7 +52,6 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
@@ -93,7 +92,6 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index c2a0028f4f1f1..746b8791c39f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -276,7 +276,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
@@ -415,7 +414,6 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 104462a506c8c..71c63bfd69734 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -221,7 +221,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
@@ -331,7 +330,6 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
index 13bb72a96142f..e3889ab8f5a21 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -428,8 +428,8 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s1, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc
@@ -459,7 +459,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
@@ -578,10 +577,9 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
@@ -616,7 +614,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
index e75dd7409d51b..f001bf97fcd9e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -428,8 +428,8 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s1, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc
@@ -459,7 +459,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
@@ -578,10 +577,9 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
@@ -616,7 +614,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
index 46b2516f72f8e..9018160806925 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
@@ -97,8 +97,8 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 1fc7349882ba1..a9240eff8e691 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -1385,8 +1385,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v0, 0xffff, v0
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003
; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10005
@@ -1690,7 +1690,6 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s6, 0xffff, s2
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001
@@ -4904,13 +4903,13 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10002
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5658,10 +5657,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_mov_b32_e32 v7, v1
; GFX12-NEXT: v_mov_b32_e32 v11, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_bfe_u32 v2, v4, 11, 1
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c
@@ -6004,10 +6002,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: global_load_u16 v0, v32, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s3, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshr_b32 s4, s3, 15
; GFX12-NEXT: s_lshr_b32 s2, s3, 14
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: s_lshr_b32 s6, s3, 12
; GFX12-NEXT: s_lshr_b32 s8, s3, 13
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index a6db7d331cef3..aaf81e2fa4000 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -3476,7 +3476,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
@@ -3584,7 +3583,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
@@ -3692,7 +3690,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
@@ -3800,7 +3797,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index e44803d611f84..8426224d9dd50 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -97,8 +97,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -123,8 +123,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -399,8 +399,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -418,8 +418,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -448,8 +448,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -467,8 +467,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -910,8 +910,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[10:11], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -936,8 +936,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -970,8 +970,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -996,8 +996,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 896cb6042e810..1480743e435ff 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -96,8 +96,8 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -122,8 +122,8 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -410,8 +410,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -429,8 +429,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -459,8 +459,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -478,8 +478,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -945,8 +945,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[10:11], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -971,8 +971,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -1005,8 +1005,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -1031,8 +1031,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index d62f045674ace..f43ca4fdc1762 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -92,10 +92,9 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX12-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: v_fma_f32 v1, v1, v0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_cmp_le_f32_e64 s0, 0, v1
; GFX12-NEXT: s_and_b32 s0, s0, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b32 s2, s2, s0
; GFX12-NEXT: s_branch .LBB0_1
; GFX12-NEXT: .LBB0_4: ; %loop0_merge
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index 1e6106896e0a4..aaac4212e003b 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -217,7 +217,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000
; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2
; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index b4eb775008122..a63d9f22236d5 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -106,11 +106,11 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX11-NEXT: s_addc_u32 s7, s3, s5
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0
; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -439,8 +439,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s4, s6, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -560,10 +560,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 94b22b79f6632..0b68a0534fa08 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -452,10 +452,9 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -482,10 +481,9 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 0501602bbd8f4..12f8dd597684d 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1349,7 +1349,6 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1
; GFX11-LABEL: no_skip_no_successors:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5]
; GFX11-NEXT: s_cbranch_vccz .LBB12_3
; GFX11-NEXT: ; %bb.1: ; %bb6
@@ -1361,7 +1360,6 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1
; GFX11-NEXT: s_mov_b64 exec, 0
; GFX11-NEXT: .LBB12_3: ; %bb3
; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX11-NEXT: ; %bb.4: ; %bb5
; GFX11-NEXT: .LBB12_5:
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 2a2fd93bc2d0b..eb1b844ad8938 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -452,10 +452,9 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -482,10 +481,9 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
index 5b40d53e0a81c..bff5c6c0db365 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
@@ -6,9 +6,9 @@ define amdgpu_kernel void @icmp_test() {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_cmp_eq_u16_e64 s[0:1], 0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: ds_store_b32 v1, v0
; CHECK-NEXT: s_endpgm
@@ -27,11 +27,10 @@ define amdgpu_kernel void @fcmp_test(half %x, half %y) {
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s1, s0, 16
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: v_cmp_le_f16_e64 s[0:1], s0, s1
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: ds_store_b32 v1, v0
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index 5360ff2fa402f..a3f632267ccd6 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1625,10 +1625,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v1, s[8:9]
; GFX11-NEXT: s_endpgm
@@ -1648,10 +1647,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX12-NEXT: global_store_b8 v0, v1, s[8:9]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index 0211c5111c31d..e30c8a53b0571 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -57,8 +57,8 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_store_b32 v0, v0, s[4:7], 0 offen
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
>From 8db12e2c8999f6f90a3e6de6d9a927f775e816d5 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Thu, 20 Feb 2025 13:22:58 +0100
Subject: [PATCH 2/7] added cycle reduction for instructions issued between
VALU->SGPR and SGPR->SALU
---
.../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 49 +++++++++++-
llvm/test/CodeGen/AMDGPU/bf16.ll | 5 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 4 +-
.../CodeGen/AMDGPU/combine-add-zext-xor.ll | 6 +-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 8 --
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 18 ++---
llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 7 +-
llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir | 74 +++++++++++++++++++
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 3 +-
9 files changed, 137 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 7eb608fc93e63..11773f0576f91 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -340,6 +340,11 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
bool Changed = false;
MachineInstr *LastDelayAlu = nullptr;
+ bool VALUSALUStall = false;
+ MCRegUnit lastSgprWrite = 0;
+ MCRegUnit longestWait = 0;
+ unsigned deletedCyclesNum = 0;
+
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
for (auto &MI : MBB.instrs()) {
@@ -371,15 +376,51 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
auto It = State.find(Unit);
if (It != State.end()) {
- if (!(SII->isSALU(MI.getOpcode())) ||
- !AMDGPU::isSGPR(Op.getReg(), TRI) ||
- It->second.VALUCycles == 0)
+ if (SII->isSALU(MI.getOpcode()) &&
+ AMDGPU::isSGPR(Op.getReg(), TRI) &&
+ It->second.VALUCycles > 0) {
+ deletedCyclesNum = It->second.VALUCycles;
+ State.erase(Unit);
+ VALUSALUStall = true;
+ } else {
Delay.merge(It->second);
- State.erase(Unit);
+ State.erase(Unit);
+ }
}
}
}
}
+ unsigned maxCycles = 0;
+ unsigned lastWrite = 0;
+ if (Type != OTHER) {
+ for (const auto &Op : MI.defs()) {
+ for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
+ if (AMDGPU::isSGPR(Op.getReg(), TRI)) {
+ maxCycles =
+ (State.find(longestWait) == State.end())
+ ? std::max(deletedCyclesNum, (unsigned)0)
+ : std::max(State[longestWait].VALUCycles,
+ State[longestWait].SALUCycles);
+ lastWrite =
+ (State.find(lastSgprWrite) == State.end())
+ ? 0
+ : std::max(State[lastSgprWrite].VALUCycles,
+ State[lastSgprWrite].SALUCycles);
+ if (maxCycles <= lastWrite)
+ longestWait = lastSgprWrite;
+ lastSgprWrite = Unit;
+ }
+ }
+ }
+ }
+
+ if (VALUSALUStall) {
+ State.advance(VALU, maxCycles);
+ VALUSALUStall = false;
+ lastSgprWrite = 0;
+ longestWait = 0;
+ }
+
if (Emit && !MI.isBundledWithPred()) {
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
// just ignore them?
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 7a8e44d640523..19a41d02240e0 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2359,16 +2359,15 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 03c63063101bb..1bf38a4b51718 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2782,7 +2782,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s7, s5, s0
; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1
@@ -2807,7 +2807,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s6, s10, s0
; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index 8dc781fdb67a7..2839b0395f9cf 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -39,11 +39,10 @@ define i32 @combine_add_zext_xor() {
; GFX1100-NEXT: .LBB0_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB0_4
; GFX1100-NEXT: .LBB0_2: ; %.a
@@ -119,11 +118,10 @@ define i32 @combine_sub_zext_xor() {
; GFX1100-NEXT: .LBB1_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB1_4
; GFX1100-NEXT: .LBB1_2: ; %.a
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 5dd96237685dd..07c9521e7646a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -15305,7 +15305,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15467,7 +15466,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15641,7 +15639,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15809,7 +15806,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15966,7 +15962,6 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16122,7 +16117,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -16289,7 +16283,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16451,7 +16444,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 4d40cefb5c987..12350c9d17ee0 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -3103,17 +3103,16 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fpround = fptrunc double %a to half
@@ -3252,13 +3251,13 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg double %a
@@ -3563,13 +3562,13 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3716,15 +3715,14 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fpround = fptrunc double %a to half
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 0c5b8b096d910..55a61e299768d 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -959,12 +959,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: v_mov_b32_e32 v3, s5
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB6_1
@@ -1094,17 +1094,16 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s5, s5, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, s5, v2
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
; GFX11-NEXT: v_mov_b32_e32 v3, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB7_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 9a2b2bebd16fe..86219940ebcd9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -560,5 +560,79 @@ body: |
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
+
+# Check if s_delay_alu is added
+---
+name: redundant_delay_alu_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: redundant_delay_alu_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr0= S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+...
+
+# Check if s_delay_alu is added
+---
+name: delay_alu
+body: |
+ bb.0:
+ ; CHECK-LABEL: delay_alu:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+...
+
+# Check if redundant delay_alu is removed
+---
+name: redundant_delay_alu_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: redundant_delay_alu_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s6, s7
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr6_sgpr7 = V_CMP_EQ_U32_e64 $sgpr6, $sgpr7, implicit $exec
+ $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+...
+
+# Check if redundant delay_alu is removed while the SALU->SALU delay is preserved
+---
+name: perserved_delay
+body: |
+ bb.0:
+ ; CHECK-LABEL: perserved_delay:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+ ; CHECK-NEXT: s_or_b32 s2, s0, s0
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ liveins : $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
+ $sgpr2 = S_OR_B32 $sgpr0, $sgpr0, implicit-def $scc
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+...
## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index a9240eff8e691..af4fb0c4d6f6e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -4904,12 +4904,11 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10002
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: s_wait_alu 0xfffe
>From 5268ef5ae7582bfe7c220427e5b2a9efe9ab8f59 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 21 Feb 2025 11:25:47 +0100
Subject: [PATCH 3/7] added function that skips reduction for VALU->VGPR
---
.../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 57 +-
.../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 4 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 24 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 26 +-
.../AMDGPU/atomic_optimizations_buffer.ll | 13 -
.../atomic_optimizations_global_pointer.ll | 64 +-
.../atomic_optimizations_local_pointer.ll | 43 +-
.../atomic_optimizations_pixelshader.ll | 3 +-
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 11 -
.../atomic_optimizations_struct_buffer.ll | 11 -
llvm/test/CodeGen/AMDGPU/bf16.ll | 5 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 9 -
.../buffer-fat-pointer-atomicrmw-fmax.ll | 12 -
.../buffer-fat-pointer-atomicrmw-fmin.ll | 12 -
.../test/CodeGen/AMDGPU/carryout-selection.ll | 2 +-
.../CodeGen/AMDGPU/combine-add-zext-xor.ll | 6 +-
.../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 1 +
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 7 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 36 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 40 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 40 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 40 +-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 8 +
llvm/test/CodeGen/AMDGPU/fptrunc.ll | 7 +-
llvm/test/CodeGen/AMDGPU/fract-match.ll | 7 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 36 +-
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 40 +-
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 40 +-
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 40 +-
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 4 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 20 +-
.../AMDGPU/global_atomics_scan_fmax.ll | 15 +-
.../AMDGPU/global_atomics_scan_fmin.ll | 15 +-
.../AMDGPU/global_atomics_scan_fsub.ll | 20 +-
llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 7 +-
llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir | 648 +++++++++++-------
.../insert_waitcnt_for_precise_memory.ll | 1 -
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 20 +-
.../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 18 +-
.../AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll | 6 +-
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 64 +-
.../AMDGPU/llvm.amdgcn.permlane.ptr.ll | 16 -
...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 1 -
....amdgcn.struct.buffer.load.format.v3f16.ll | 1 -
...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 2 -
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 2 -
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 2 -
...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 2 -
...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 2 -
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 15 +-
.../AMDGPU/llvm.amdgcn.writelane.ptr.ll | 11 -
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 15 +-
.../AMDGPU/load-constant-always-uniform.ll | 3 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 3 +-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 14 +-
.../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 2 -
.../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 2 -
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 2 -
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 12 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 16 +-
.../AMDGPU/pseudo-scalar-transcendental.ll | 3 +-
63 files changed, 682 insertions(+), 934 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 11773f0576f91..aa1e72c8bfe70 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/MC/MCRegister.h"
using namespace llvm;
@@ -236,6 +237,15 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
}
}
+ void advanceByNum(DelayType Type, unsigned Cycles, unsigned VALUNum) {
+ iterator Next;
+ for (auto I = begin(), E = end(); I != E; I = Next) {
+ Next = std::next(I);
+ if (I->second.VALUNum >= VALUNum && I->second.advance(Type, Cycles))
+ erase(I);
+ }
+ }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(const TargetRegisterInfo *TRI) const {
if (empty()) {
@@ -340,11 +350,7 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
bool Changed = false;
MachineInstr *LastDelayAlu = nullptr;
- bool VALUSALUStall = false;
- MCRegUnit lastSgprWrite = 0;
- MCRegUnit longestWait = 0;
- unsigned deletedCyclesNum = 0;
-
+ MCRegUnit lastSGPRfromVALU = 0;
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
for (auto &MI : MBB.instrs()) {
@@ -359,6 +365,13 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
DelayType Type = getDelayType(MI.getDesc().TSFlags);
+ if (SII->isSALU(MI.getOpcode())) {
+ if (State.find(lastSGPRfromVALU) != State.end()){
+ State.advanceByNum(SALU, State[lastSGPRfromVALU].VALUCycles, State[lastSGPRfromVALU].VALUNum);
+ lastSGPRfromVALU = 0;
+ }
+ }
+
if (instructionWaitsForVALU(MI)) {
// Forget about all outstanding VALU delays.
// TODO: This is overkill since it also forgets about SALU delays.
@@ -376,50 +389,24 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
auto It = State.find(Unit);
if (It != State.end()) {
- if (SII->isSALU(MI.getOpcode()) &&
- AMDGPU::isSGPR(Op.getReg(), TRI) &&
- It->second.VALUCycles > 0) {
- deletedCyclesNum = It->second.VALUCycles;
- State.erase(Unit);
- VALUSALUStall = true;
- } else {
Delay.merge(It->second);
State.erase(Unit);
- }
}
}
}
}
- unsigned maxCycles = 0;
- unsigned lastWrite = 0;
- if (Type != OTHER) {
+
+ if (SII->isVALU(MI.getOpcode())) {
for (const auto &Op : MI.defs()) {
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
if (AMDGPU::isSGPR(Op.getReg(), TRI)) {
- maxCycles =
- (State.find(longestWait) == State.end())
- ? std::max(deletedCyclesNum, (unsigned)0)
- : std::max(State[longestWait].VALUCycles,
- State[longestWait].SALUCycles);
- lastWrite =
- (State.find(lastSgprWrite) == State.end())
- ? 0
- : std::max(State[lastSgprWrite].VALUCycles,
- State[lastSgprWrite].SALUCycles);
- if (maxCycles <= lastWrite)
- longestWait = lastSgprWrite;
- lastSgprWrite = Unit;
+ lastSGPRfromVALU = Unit;
+ break;
}
}
}
}
- if (VALUSALUStall) {
- State.advance(VALU, maxCycles);
- VALUSALUStall = false;
- lastSgprWrite = 0;
- longestWait = 0;
- }
if (Emit && !MI.isBundledWithPred()) {
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index e1397e7331d3c..c59f56d18d178 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -2854,7 +2854,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: flat_store_b32 v[0:1], v3
; GFX12-NEXT: s_endpgm
@@ -3842,7 +3842,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 6e55d7fdb5e95..00fd331162bdc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -361,7 +361,7 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
@@ -385,7 +385,7 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
@@ -2766,7 +2766,7 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
@@ -2790,7 +2790,7 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
@@ -3981,7 +3981,7 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
@@ -4005,7 +4005,7 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
@@ -4359,7 +4359,7 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400
; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
@@ -4383,7 +4383,7 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400
; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
@@ -4732,7 +4732,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
@@ -4756,7 +4756,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
@@ -5121,7 +5121,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
@@ -5145,7 +5145,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index ba2af13338be6..6111e9a460e6c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1072,15 +1072,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v2, v11
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
; GFX12-NEXT: s_wait_alu 0xf1fd
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -2436,48 +2435,45 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mov_b32_e32 v20, v22
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mov_b32_e32 v19, v22
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
; GFX12-NEXT: v_mov_b32_e32 v20, v18
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11
; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12
@@ -2490,7 +2486,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX12-NEXT: v_mov_b32_e32 v14, v21
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
@@ -2504,7 +2499,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
@@ -2521,7 +2515,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 4b68f8a4bd194..8319e112f526e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -240,7 +240,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -454,7 +453,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -484,7 +482,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
@@ -517,7 +514,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -548,7 +544,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
@@ -887,7 +882,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -932,7 +926,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1292,7 +1285,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1339,7 +1331,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1977,7 +1968,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_endpgm
@@ -2010,7 +2000,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_endpgm
@@ -2349,7 +2338,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -2395,7 +2383,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 2bcce6c04c0bb..3701a96587757 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -215,7 +215,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -249,7 +248,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -285,7 +283,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -318,7 +315,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -929,7 +925,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -977,7 +972,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -1026,7 +1020,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1074,7 +1067,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1403,12 +1395,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
@@ -1532,12 +1523,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
@@ -2155,7 +2145,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
@@ -2199,7 +2189,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
@@ -2242,7 +2232,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
@@ -2282,7 +2272,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
@@ -3254,7 +3244,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc
@@ -3339,7 +3329,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
@@ -3396,7 +3386,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
@@ -3452,7 +3441,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
@@ -3540,7 +3528,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
; GFX1232_DPP-NEXT: s_mov_b32 s2, s6
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
@@ -4080,7 +4068,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4117,7 +4105,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: v_mul_lo_u32 v0, s4, v0
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4156,7 +4144,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4194,7 +4182,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_mul_lo_u32 v0, s4, v0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4478,7 +4466,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4526,7 +4513,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4575,7 +4561,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4623,7 +4608,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4952,12 +4936,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
@@ -5081,12 +5064,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
@@ -5734,7 +5716,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v5, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -5779,7 +5761,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v5, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -5823,7 +5805,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -5866,7 +5848,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -6836,7 +6818,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc
@@ -6921,7 +6903,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10
; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
@@ -6978,7 +6960,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
@@ -7034,7 +7015,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
@@ -7122,7 +7102,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1232_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1232_DPP-NEXT: v_readfirstlane_b32 s3, v9
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10
; GFX1232_DPP-NEXT: s_mov_b32 s2, s6
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index eb5353e928682..7c008a54e8e94 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -186,7 +186,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -215,7 +214,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -407,7 +405,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
@@ -440,7 +437,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT: s_endpgm
@@ -693,7 +689,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -735,7 +730,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -1909,12 +1903,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -1947,12 +1940,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
@@ -2747,7 +2739,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s3, v9
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s4, v10, vcc
@@ -2825,7 +2817,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v10
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo
@@ -3823,7 +3815,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -3856,7 +3848,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_endpgm
@@ -4109,7 +4100,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -4151,7 +4141,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -5352,7 +5341,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v5, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -5391,7 +5380,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v5, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
@@ -6186,7 +6175,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s3, v9
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s4, v10, vcc
@@ -6264,7 +6253,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v10
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo
@@ -6522,7 +6511,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -6564,7 +6552,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_and_b32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -7885,7 +7872,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -7927,7 +7913,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -9247,7 +9232,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -9289,7 +9273,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -10609,7 +10592,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -10651,7 +10633,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_max_i32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -12439,7 +12420,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -12481,7 +12461,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_min_i32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -14269,7 +14248,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -14311,7 +14289,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_max_u32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -16086,7 +16063,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -16128,7 +16104,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_min_u32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 4ae08a0375c8c..0c624a83ae1be 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -576,12 +576,11 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_readlane_b32 s10, v1, 15
; GFX1132-NEXT: s_mov_b32 exec_lo, s9
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s9, -1
; GFX1132-NEXT: v_writelane_b32 v3, s10, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s9
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 0a06fe4ea949e..6a82dbeec5e2f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -239,7 +239,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -453,7 +452,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -483,7 +481,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
@@ -516,7 +513,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -547,7 +543,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
@@ -886,7 +881,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -931,7 +925,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1569,7 +1562,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_endpgm
@@ -1602,7 +1594,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_endpgm
@@ -1941,7 +1932,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1987,7 +1977,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index bc0bec4772e52..dd4c0b0625ea8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -247,7 +247,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -467,7 +466,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -498,7 +496,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
@@ -532,7 +529,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -563,7 +559,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
@@ -909,7 +904,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -954,7 +948,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1736,7 +1729,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_endpgm
@@ -1769,7 +1761,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_endpgm
@@ -2115,7 +2106,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -2161,7 +2151,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_wait_alu 0xf1ff
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 19a41d02240e0..7a8e44d640523 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2359,15 +2359,16 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index af67012718fd7..32f40b8015e87 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -383,7 +383,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -2370,7 +2369,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
; GFX12-NEXT: v_readfirstlane_b32 s7, v8
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -2401,7 +2399,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
; GFX12-NEXT: v_readfirstlane_b32 s7, v8
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -4113,7 +4110,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -4148,7 +4144,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -5555,7 +5550,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -5601,7 +5595,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -6728,7 +6721,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -9046,7 +9038,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 96b9964e39dc4..cb557c62c206c 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -375,7 +375,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -1583,7 +1582,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
; GFX12-NEXT: v_readfirstlane_b32 s7, v8
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -1616,7 +1614,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
; GFX12-NEXT: v_readfirstlane_b32 s7, v8
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -3214,7 +3211,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -3252,7 +3248,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -4676,7 +4671,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -4722,7 +4716,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -6003,7 +5996,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -6036,7 +6028,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -7059,7 +7050,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
@@ -7454,7 +7444,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -7505,7 +7494,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 1cf4fa6da1627..fea674a100b99 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -375,7 +375,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -1583,7 +1582,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
; GFX12-NEXT: v_readfirstlane_b32 s7, v8
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -1616,7 +1614,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_readfirstlane_b32 s6, v7
; GFX12-NEXT: v_readfirstlane_b32 s7, v8
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -3214,7 +3211,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -3252,7 +3248,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -4676,7 +4671,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -4722,7 +4716,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -6003,7 +5996,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -6036,7 +6028,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -7059,7 +7050,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
@@ -7454,7 +7444,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -7505,7 +7494,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 1bf38a4b51718..f975c830aa2aa 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2829,7 +2829,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s0, s0, s7
; GFX11-NEXT: v_sub_co_u32 v0, s7, s10, s12
; GFX11-NEXT: s_mul_i32 s6, s3, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_add_i32 s0, s0, s6
; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2
; GFX11-NEXT: s_sub_i32 s6, s11, s0
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index 2839b0395f9cf..8dc781fdb67a7 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -39,10 +39,11 @@ define i32 @combine_add_zext_xor() {
; GFX1100-NEXT: .LBB0_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB0_4
; GFX1100-NEXT: .LBB0_2: ; %.a
@@ -118,10 +119,11 @@ define i32 @combine_sub_zext_xor() {
; GFX1100-NEXT: .LBB1_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB1_4
; GFX1100-NEXT: .LBB1_2: ; %.a
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index e0a83c7644b14..64c887d570e54 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -754,6 +754,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index fe60963534846..ae4fe9bb0dee3 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1708,24 +1708,23 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-NEXT: s_lshl_b32 s7, s2, 12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-NEXT: s_or_b32 s3, s5, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
; GFX11-NEXT: s_or_b32 s7, s3, s7
; GFX11-NEXT: s_lshr_b32 s6, s5, s6
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6
; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-NEXT: s_or_b32 s5, s6, s5
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cselect_b32 s5, s5, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s6, s5, 7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-NEXT: s_cselect_b32 s7, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 02524bf71b074..d5a94273fed6c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -8410,13 +8410,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8709,13 +8708,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9286,13 +9284,12 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9574,13 +9571,12 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10299,13 +10295,12 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10601,13 +10596,12 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11238,14 +11232,13 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11593,14 +11586,13 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11949,14 +11941,13 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -12293,14 +12284,13 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -13531,14 +13521,13 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -13889,14 +13878,13 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 6ead5b93a0e39..38791bd3e4c8f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -6306,14 +6306,13 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6619,14 +6618,13 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7226,14 +7224,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7530,14 +7527,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8301,14 +8297,13 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8617,14 +8612,13 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9271,14 +9265,13 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9627,14 +9620,13 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10316,14 +10308,13 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10661,14 +10652,13 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11571,14 +11561,13 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11930,14 +11919,13 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -15282,7 +15270,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -15618,7 +15605,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -15961,7 +15947,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16674,7 +16659,6 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 1fc9ed70e009c..7ffd7f7b641de 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -6306,14 +6306,13 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6619,14 +6618,13 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7226,14 +7224,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7530,14 +7527,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8301,14 +8297,13 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8617,14 +8612,13 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9271,14 +9265,13 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9627,14 +9620,13 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10316,14 +10308,13 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10661,14 +10652,13 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11571,14 +11561,13 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11930,14 +11919,13 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -15282,7 +15270,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -15618,7 +15605,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -15961,7 +15947,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16674,7 +16659,6 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 8d2963ce7db35..e1cbd47a3efec 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -6107,13 +6107,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6406,13 +6405,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6983,13 +6981,12 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7271,13 +7268,12 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7996,13 +7992,12 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8298,13 +8293,12 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8935,14 +8929,13 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9290,14 +9283,13 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9977,14 +9969,13 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10321,14 +10312,13 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11228,14 +11218,13 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11586,14 +11575,13 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -14809,7 +14797,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -15145,7 +15132,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -15488,7 +15474,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16201,7 +16186,6 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 07c9521e7646a..08c6dcc3b0ea9 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -15305,6 +15305,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15466,6 +15467,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15639,6 +15641,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15806,6 +15809,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15962,6 +15966,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16117,6 +16122,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -16283,6 +16289,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16444,6 +16451,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 647578b57c772..4ef5498c99fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -434,23 +434,22 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index fcaf427f6c010..0935438f1b951 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -116,7 +116,6 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures(
; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
; GFX12-NEXT: v_floor_f32_e32 v4, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
; GFX12-NEXT: global_store_b32 v[1:2], v4, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -2252,7 +2251,6 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture
; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -2389,7 +2387,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
; GFX12-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
; GFX12-NEXT: v_floor_f16_e32 v4, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
; GFX12-NEXT: global_store_b16 v[1:2], v4, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -2570,14 +2567,14 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX12-NEXT: v_fract_f16_e32 v6, v0
; GFX12-NEXT: v_floor_f16_e32 v5, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_fract_f16_e32 v4, v3
; GFX12-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204
; GFX12-NEXT: v_floor_f16_e32 v7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0
; GFX12-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_pack_b32_f16 v4, v5, v7
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 904ef8a4b6579..9f44564e35bfe 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -8557,13 +8557,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8908,13 +8907,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9586,13 +9584,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v4, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9924,13 +9921,12 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v4, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10775,13 +10771,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11129,13 +11124,12 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v4, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11866,14 +11860,13 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -12273,14 +12266,13 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -13061,14 +13053,13 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -13455,14 +13446,13 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -14488,14 +14478,13 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -14898,14 +14887,13 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index e8d73914ad302..a1f5a0289172f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4780,14 +4780,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5145,14 +5144,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5853,14 +5851,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6207,14 +6204,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7104,14 +7100,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7472,14 +7467,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8227,14 +8221,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8636,14 +8629,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9428,14 +9420,13 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9824,14 +9815,13 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10863,14 +10853,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11275,14 +11264,13 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -15230,7 +15218,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -15617,7 +15604,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16007,7 +15993,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16809,7 +16794,6 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index c1c92906df250..b026ed6250ce4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4780,14 +4780,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5145,14 +5144,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5853,14 +5851,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6207,14 +6204,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7104,14 +7100,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7472,14 +7467,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8227,14 +8221,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8636,14 +8629,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9428,14 +9420,13 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9824,14 +9815,13 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10863,14 +10853,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11275,14 +11264,13 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -15230,7 +15218,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -15617,7 +15604,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16007,7 +15993,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16809,7 +16794,6 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index c131921c83fff..838d3906fe2b8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -5525,13 +5525,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5876,13 +5875,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6554,13 +6552,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v4, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6892,13 +6889,12 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v4, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7743,13 +7739,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8097,13 +8092,12 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: global_load_b32 v4, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8834,14 +8828,13 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9241,14 +9234,13 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10029,14 +10021,13 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10423,14 +10414,13 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11456,14 +11446,13 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11866,14 +11855,13 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: global_load_b32 v3, v[0:1], off
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -15691,7 +15679,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16078,7 +16065,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -16468,7 +16454,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
@@ -17270,7 +17255,6 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 492a30b67089c..888887d121eaf 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4737,7 +4737,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
@@ -4857,7 +4857,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 15be44a335a1d..e2ca887389b1e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -665,7 +665,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -714,7 +713,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1885,7 +1883,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1934,7 +1931,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3165,7 +3161,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3214,7 +3209,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3941,7 +3935,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3990,7 +3983,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5220,7 +5212,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5282,7 +5273,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -7226,7 +7216,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -8886,7 +8876,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -10319,7 +10309,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -11234,7 +11224,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -13282,7 +13272,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index a4410bb9ed2d0..8407e9336b9d3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -613,8 +613,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1652,8 +1652,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2691,8 +2691,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4597,11 +4597,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6120,11 +6119,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -8145,11 +8143,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 68d7dcc60506c..ea4e9cda2ab47 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -613,8 +613,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1652,8 +1652,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2691,8 +2691,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4597,11 +4597,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6120,11 +6119,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -8145,11 +8143,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 7126680525b87..fa0689b45257a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -725,7 +725,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -787,7 +786,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2057,7 +2055,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2119,7 +2116,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3389,7 +3385,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3451,7 +3446,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4217,7 +4211,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4279,7 +4272,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5548,7 +5540,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5610,7 +5601,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -7554,7 +7544,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -9213,7 +9203,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -10646,7 +10636,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -11561,7 +11551,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -13608,7 +13598,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 55a61e299768d..0c5b8b096d910 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -959,12 +959,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: v_mov_b32_e32 v3, s5
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB6_1
@@ -1094,16 +1094,17 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s5, s5, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, s5, v2
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
; GFX11-NEXT: v_mov_b32_e32 v3, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB7_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 86219940ebcd9..c287fb3614496 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -1,15 +1,390 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+
+--- |
+
+ define void @valu_dep_1() {
+ ; CHECK-LABEL: valu_dep_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @valu_dep_2() {
+ ; CHECK-LABEL: valu_dep_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @valu_dep_3() {
+ ; CHECK-LABEL: valu_dep_3:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @valu_dep_4() {
+ ; CHECK-LABEL: valu_dep_4:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @valu_dep_5() {
+ ; CHECK-LABEL: valu_dep_5:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
+ ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @trans32_dep_1() {
+ ; CHECK-LABEL: trans32_dep_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @trans32_dep_2() {
+ ; CHECK-LABEL: trans32_dep_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @trans32_dep_3() {
+ ; CHECK-LABEL: trans32_dep_3:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: v_exp_f32_e32 v2, v2
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @trans32_dep_4() {
+ ; CHECK-LABEL: trans32_dep_4:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: v_exp_f32_e32 v2, v2
+ ; CHECK-NEXT: v_exp_f32_e32 v3, v3
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @salu_cycle_1() {
+ ; CHECK-LABEL: salu_cycle_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_mov_b32 s0, 0
+ ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+ ret void
+ }
+
+ define void @salu_cycle_2() {
+ ; CHECK-LABEL: salu_cycle_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_mov_b32 s0, 0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+ ret void
+ }
+
+ define void @valu_dep_1_same_trans32_dep_1() {
+ ; CHECK-LABEL: valu_dep_1_same_trans32_dep_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+ ret void
+ }
+
+ define void @trans32_dep_1_only() {
+ ; CHECK-LABEL: trans32_dep_1_only:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+ ret void
+ }
+
+ define void @valu_dep_1_same_salu_cycle_1() {
+ ; CHECK-LABEL: valu_dep_1_same_salu_cycle_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_mov_b32 s0, 0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+ ret void
+ }
+
+ define void @valu_dep_1_next_valu_dep_1() {
+ ; CHECK-LABEL: valu_dep_1_next_valu_dep_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @valu_dep_2_next_valu_dep_2() {
+ ; CHECK-LABEL: valu_dep_2_next_valu_dep_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ret void
+ }
+
+ define void @valu_dep_1_no_next_1() {
+ ; CHECK-LABEL: valu_dep_1_no_next_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0
+ ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0
+ ret void
+ }
+
+ define void @valu_dep_1_no_next_2() {
+ ; CHECK-LABEL: valu_dep_1_no_next_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+ ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @implicit_cmp_cndmask() {
+ ; CHECK-LABEL: implicit_cmp_cndmask:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1
+ ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc
+ ret void
+ }
+
+ define void @explicit_cmp_cndmask() {
+ ; CHECK-LABEL: explicit_cmp_cndmask:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
+ ret void
+ }
+
+ define void @implicit_addc_addc() {
+ ; CHECK-LABEL: implicit_addc_addc:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
+ ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
+ ret void
+ }
+
+ define void @explicit_addc_addc() {
+ ; CHECK-LABEL: explicit_addc_addc:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0
+ ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
+ ret void
+ }
+
+ define void @valu_dep_3_bundle() {
+ ; CHECK-LABEL: valu_dep_3_bundle:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @if() {
+ ; CHECK-LABEL: if:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_cbranch_vccz .LBB23_2
+ ; CHECK-NEXT: ; %bb.1:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: .LBB23_2:
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @else() {
+ ; CHECK-LABEL: else:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_cbranch_vccz .LBB24_2
+ ; CHECK-NEXT: ; %bb.1:
+ ; CHECK-NEXT: s_branch .LBB24_3
+ ; CHECK-NEXT: .LBB24_2:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: .LBB24_3:
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @if_else() {
+ ; CHECK-LABEL: if_else:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_cbranch_vccz .LBB25_2
+ ; CHECK-NEXT: ; %bb.1:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_branch .LBB25_3
+ ; CHECK-NEXT: .LBB25_2:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1
+ ; CHECK-NEXT: .LBB25_3:
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @loop_1() {
+ ; CHECK-LABEL: loop_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: .LBB26_1: ; =>This Inner Loop Header: Depth=1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0
+ ; CHECK-NEXT: s_cbranch_vccz .LBB26_1
+ ; CHECK-NEXT: ; %bb.2:
+ ret void
+ }
+
+ define void @loop_2() {
+ ; CHECK-LABEL: loop_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_cbranch_vccz .LBB27_1
+ ; CHECK-NEXT: ; %bb.2:
+ ret void
+ }
+
+ define void @sendmsg_rtn() {
+ ; CHECK-LABEL: sendmsg_rtn:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+ ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+ ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+ ; CHECK-NEXT: s_add_u32 s0, s0, s0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @flat_load() {
+ ; CHECK-LABEL: flat_load:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+ ; CHECK-NEXT: v_mov_b32_e32 v1, 0
+ ; CHECK-NEXT: v_mov_b32_e32 v2, 0
+ ; CHECK-NEXT: flat_load_b32 v0, v[0:1]
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2
+ ret void
+ }
+
+ define void @waitcnt_depctr() {
+ ; CHECK-LABEL: waitcnt_depctr:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+ ; CHECK-NEXT: s_waitcnt_depctr 0xfff
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @writelane1() {
+ ; CHECK-LABEL: writelane1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 0
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 1
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 2
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
+ ret void
+ }
+
+ define void @writelane2() {
+ ; CHECK-LABEL: writelane2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ret void
+ }
+
+ define void @delay_alu() {
+ ; CHECK-LABEL: delay_alu:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ret void
+ }
+
+ define void @redundant_delay_alu() {
+ ; CHECK-LABEL: redundant_delay_alu:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s6, s7
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ret void
+ }
+
+ define void @redundant_delay_alu_2() {
+ ; CHECK-LABEL: redundant_delay_alu_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ret void
+ }
+...
+
---
name: valu_dep_1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
@@ -18,12 +393,6 @@ body: |
name: valu_dep_2
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_2:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
@@ -33,13 +402,6 @@ body: |
name: valu_dep_3
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_3:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
- ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
@@ -50,14 +412,6 @@ body: |
name: valu_dep_4
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_4:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
- ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
- ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
@@ -71,14 +425,6 @@ body: |
name: valu_dep_5
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_5:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
- ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
- ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
- ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
@@ -91,11 +437,6 @@ body: |
name: trans32_dep_1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}trans32_dep_1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_exp_f32_e32 v0, v0
- ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
@@ -104,12 +445,6 @@ body: |
name: trans32_dep_2
body: |
bb.0:
- ; CHECK-LABEL: {{^}}trans32_dep_2:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_exp_f32_e32 v0, v0
- ; CHECK-NEXT: v_exp_f32_e32 v1, v1
- ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
@@ -119,13 +454,6 @@ body: |
name: trans32_dep_3
body: |
bb.0:
- ; CHECK-LABEL: {{^}}trans32_dep_3:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_exp_f32_e32 v0, v0
- ; CHECK-NEXT: v_exp_f32_e32 v1, v1
- ; CHECK-NEXT: v_exp_f32_e32 v2, v2
- ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
@@ -138,13 +466,6 @@ body: |
name: trans32_dep_4
body: |
bb.0:
- ; CHECK-LABEL: {{^}}trans32_dep_4:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_exp_f32_e32 v0, v0
- ; CHECK-NEXT: v_exp_f32_e32 v1, v1
- ; CHECK-NEXT: v_exp_f32_e32 v2, v2
- ; CHECK-NEXT: v_exp_f32_e32 v3, v3
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
@@ -156,11 +477,6 @@ body: |
name: salu_cycle_1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}salu_cycle_1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: s_mov_b32 s0, 0
- ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
$sgpr0 = S_MOV_B32 0
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
...
@@ -171,11 +487,6 @@ body: |
name: salu_cycle_2
body: |
bb.0:
- ; CHECK-LABEL: {{^}}salu_cycle_2:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: s_mov_b32 s0, 0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
$sgpr0 = S_MOV_B32 0
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
@@ -185,12 +496,6 @@ body: |
name: valu_dep_1_same_trans32_dep_1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_exp_f32_e32 v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
- ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
$vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
@@ -202,12 +507,6 @@ body: |
name: trans32_dep_1_only
body: |
bb.0:
- ; CHECK-LABEL: {{^}}trans32_dep_1_only:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_exp_f32_e32 v1, v1
- ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
@@ -217,12 +516,6 @@ body: |
name: valu_dep_1_same_salu_cycle_1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: s_mov_b32 s0, 0
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$sgpr0 = S_MOV_B32 0
$vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
@@ -232,12 +525,6 @@ body: |
name: valu_dep_1_next_valu_dep_1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
@@ -247,13 +534,6 @@ body: |
name: valu_dep_2_next_valu_dep_2
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
@@ -267,12 +547,6 @@ body: |
name: valu_dep_1_no_next_1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_1_no_next_1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0
- ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0
$vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
$vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
@@ -285,13 +559,6 @@ body: |
name: valu_dep_1_no_next_2
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_1_no_next_2:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
- ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
- ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
$vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
$vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
$vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
@@ -305,10 +572,6 @@ body: |
name: implicit_cmp_cndmask
body: |
bb.0:
- ; CHECK-LABEL: {{^}}implicit_cmp_cndmask:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1
- ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc
implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
$vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec
...
@@ -318,11 +581,6 @@ body: |
name: explicit_cmp_cndmask
body: |
bb.0:
- ; CHECK-LABEL: {{^}}explicit_cmp_cndmask:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
$sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec
$vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec
...
@@ -331,10 +589,6 @@ body: |
name: implicit_addc_addc
body: |
bb.0:
- ; CHECK-LABEL: {{^}}implicit_addc_addc:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
- ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
$vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
$vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
...
@@ -343,10 +597,6 @@ body: |
name: explicit_addc_addc
body: |
bb.0:
- ; CHECK-LABEL: {{^}}explicit_addc_addc:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0
- ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
$vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
$vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
...
@@ -355,13 +605,6 @@ body: |
name: valu_dep_3_bundle
body: |
bb.0:
- ; CHECK-LABEL: {{^}}valu_dep_3_bundle:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
- ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
BUNDLE {
$vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
@@ -374,14 +617,6 @@ body: |
name: if
body: |
bb.0:
- ; CHECK-LABEL: {{^}}if:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: s_cbranch_vccz .LBB23_2
- ; CHECK-NEXT: %bb.1:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: .LBB23_2:
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
S_CBRANCH_VCCZ %bb.2, implicit $vcc
bb.1:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
@@ -393,16 +628,6 @@ body: |
name: else
body: |
bb.0:
- ; CHECK-LABEL: {{^}}else:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: s_cbranch_vccz .LBB24_2
- ; CHECK-NEXT: %bb.1
- ; CHECK-NEXT: s_branch .LBB24_3
- ; CHECK-NEXT: .LBB24_2:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: .LBB24_3:
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
S_CBRANCH_VCCZ %bb.2, implicit $vcc
bb.1:
S_BRANCH %bb.3
@@ -416,18 +641,6 @@ body: |
name: if_else
body: |
bb.0:
- ; CHECK-LABEL: {{^}}if_else:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: s_cbranch_vccz .LBB25_2
- ; CHECK-NEXT: %bb.1:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: s_branch .LBB25_3
- ; CHECK-NEXT: .LBB25_2:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1
- ; CHECK-NEXT: .LBB25_3:
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
S_CBRANCH_VCCZ %bb.2, implicit $vcc
bb.1:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
@@ -444,13 +657,6 @@ body: |
name: loop_1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}loop_1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: .LBB26_1:
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0
- ; CHECK-NEXT: s_cbranch_vccz .LBB26_1
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
bb.1:
$vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
@@ -463,12 +669,6 @@ body: |
name: loop_2
body: |
bb.0:
- ; CHECK-LABEL: {{^}}loop_2:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: .LBB27_1:
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
- ; CHECK-NEXT: s_cbranch_vccz .LBB27_1
bb.1:
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
S_CBRANCH_VCCZ %bb.1, implicit $vcc
@@ -481,13 +681,6 @@ body: |
name: sendmsg_rtn
body: |
bb.0:
- ; CHECK-LABEL: {{^}}sendmsg_rtn:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_mov_b32_e32 v0, 0
- ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
- ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
- ; CHECK-NEXT: s_add_u32 s0, s0, s0
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_SENDMSG_RTN_B32 128
$sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc
@@ -500,13 +693,6 @@ body: |
name: flat_load
body: |
bb.0:
- ; CHECK-LABEL: {{^}}flat_load:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_mov_b32_e32 v0, 0
- ; CHECK-NEXT: v_mov_b32_e32 v1, 0
- ; CHECK-NEXT: v_mov_b32_e32 v2, 0
- ; CHECK-NEXT: flat_load_b32 v0, v[0:1]
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
@@ -520,11 +706,6 @@ body: |
name: waitcnt_depctr
body: |
bb.0:
- ; CHECK-LABEL: {{^}}waitcnt_depctr:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_mov_b32_e32 v0, 0
- ; CHECK-NEXT: s_waitcnt_depctr 0xfff
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT_DEPCTR 4095
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
@@ -535,12 +716,6 @@ body: |
name: writelane1
body: |
bb.0:
- ; CHECK-LABEL: {{^}}writelane1:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_writelane_b32 v0, s0, 0
- ; CHECK-NEXT: v_writelane_b32 v0, s0, 1
- ; CHECK-NEXT: v_writelane_b32 v0, s0, 2
- ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
$vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0
$vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0
$vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0
@@ -552,87 +727,38 @@ body: |
name: writelane2
body: |
bb.0:
- ; CHECK-LABEL: {{^}}writelane2:
- ; CHECK: %bb.0:
- ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
- ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
- ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
-
-# Check if s_delay_alu is added
----
-name: redundant_delay_alu_1
-body: |
- bb.0:
- ; CHECK-LABEL: redundant_delay_alu_1:
- ; CHECK: ; %bb.0:
- ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1
- ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
- ; CHECK-NEXT: s_or_b32 s0, s0, s1
- ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
- $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec
- $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
- $sgpr0= S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
- $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
-...
-
# Check if s_delay_alu is added
---
name: delay_alu
body: |
bb.0:
- ; CHECK-LABEL: delay_alu:
- ; CHECK: ; %bb.0:
- ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
- ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
- ; CHECK-NEXT: s_or_b32 s0, s0, s1
- ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
- $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
$sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
...
-
-# Check if reduntant delay_alu is removed
+# Check if redundant delay_alu is removed
---
-name: redundant_delay_alu_2
+name: redundant_delay_alu
body: |
bb.0:
- ; CHECK-LABEL: redundant_delay_alu_2:
- ; CHECK: ; %bb.0:
- ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
- ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
- ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s6, s7
- ; CHECK-NEXT: s_or_b32 s0, s0, s1
- ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
$sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
$sgpr6_sgpr7 = V_CMP_EQ_U32_e64 $sgpr6, $sgpr7, implicit $exec
$sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
...
-
-# Check if reduntant delay_alu is removed
+# Check if redundant delay_alu is removed
---
-name: perserved_delay
+name: redundant_delay_alu_2
body: |
bb.0:
- ; CHECK-LABEL: perserved_delay:
- ; CHECK: ; %bb.0:
- ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
- ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
- ; CHECK-NEXT: s_or_b32 s0, s0, s1
- ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
- ; CHECK-NEXT: s_or_b32 s2, s0, s0
- ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
- liveins : $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
- $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec
$sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
- $sgpr2 = S_OR_B32 $sgpr0, $sgpr0, implicit-def $scc
$vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
...
-## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-# CHECK: {{.*}}
+
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 2f37b45651234..efd4a0044c660 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -1060,7 +1060,6 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: v_readfirstlane_b32 s2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index c0c0d3ded117d..a6be79135ec5d 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -5125,22 +5125,21 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v5, v2
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v4, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v2, 0
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add3_u32 v4, v4, v6, v2
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v3, v1
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v5, v4, v0
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v0, v[3:4]
@@ -5164,7 +5163,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1]
@@ -5176,7 +5175,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v4, v2
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v0, v5, vcc_lo
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v3, v[1:2]
; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
@@ -5185,7 +5184,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v4
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[3:4]
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v5, v7, v4
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v6, v[0:1]
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
@@ -5978,19 +5977,18 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v13, v8, v5
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v14, v11, v6
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v15, v10, v7
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v6, 0
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1200-SDAG-NEXT: v_add3_u32 v12, v1, v13, v12
; GFX1200-SDAG-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1200-SDAG-NEXT: v_add3_u32 v13, v3, v15, v14
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v12, v9, vcc_lo
; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v2, v10
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
@@ -6040,7 +6038,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-GISEL-NEXT: v_add_co_u32 v9, s0, v2, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v1, vcc_lo
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v8, v4
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v9, v6
; GFX1200-GISEL-NEXT: s_wait_alu 0xf1ff
@@ -6078,7 +6075,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v8
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v9
; GFX1200-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v7, 1
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 110192ecefe55..1e2bf8256321d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -121,12 +121,11 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
; GISEL12-NEXT: s_wait_alu 0xf1ff
; GISEL12-NEXT: v_mov_b32_e32 v0, s8
; GISEL12-NEXT: s_mov_b32 exec_lo, s4
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10
; GISEL12-NEXT: ; %bb.2: ; %tail
; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -150,7 +149,7 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal
; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
-; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10
@@ -241,12 +240,11 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
; GISEL12-NEXT: s_wait_alu 0xf1ff
; GISEL12-NEXT: v_mov_b32_e32 v0, s8
; GISEL12-NEXT: s_mov_b32 exec_lo, s4
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v12
; GISEL12-NEXT: ; %bb.2: ; %tail
; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -269,7 +267,7 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg %
; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4
-; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v12
@@ -363,13 +361,12 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call
; GISEL12-NEXT: s_or_saveexec_b32 s8, -1
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v1, s8
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GISEL12-NEXT: v_cmp_ne_u32_e64 s9, 0, v0
; GISEL12-NEXT: s_wait_alu 0xf1ff
; GISEL12-NEXT: v_mov_b32_e32 v0, s9
; GISEL12-NEXT: s_mov_b32 exec_lo, s8
; GISEL12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GISEL12-NEXT: v_mov_b32_e32 v11, v0
; GISEL12-NEXT: s_or_b32 s4, vcc_lo, s4
; GISEL12-NEXT: s_wait_alu 0xfffe
@@ -613,12 +610,11 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee,
; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: v_cndmask_b32_e64 v13, 0x47, v12, s4
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v13
; GISEL12-NEXT: s_wait_alu 0xf1ff
; GISEL12-NEXT: v_mov_b32_e32 v13, s8
; GISEL12-NEXT: s_mov_b32 exec_lo, s4
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL12-NEXT: v_dual_mov_b32 v11, v13 :: v_dual_add_nc_u32 v10, 42, v12
; GISEL12-NEXT: ;;#ASMSTART
; GISEL12-NEXT: ; use v0-7
@@ -646,7 +642,7 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee,
; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: v_cndmask_b32_e64 v13, 0x47, v12, s4
-; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v13
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v12
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
index e0a5d397bded4..baa904878310b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll
@@ -26,11 +26,10 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
; GISEL12-NEXT: s_wait_alu 0xf1ff
; GISEL12-NEXT: v_mov_b32_e32 v0, s12
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GISEL12-NEXT: v_mov_b32_e32 v1, s13
; GISEL12-NEXT: s_mov_b64 exec, s[10:11]
; GISEL12-NEXT: v_mov_b32_e32 v11, v0
@@ -61,12 +60,11 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64
; DAGISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11]
-; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0
; DAGISEL12-NEXT: s_mov_b64 exec, s[10:11]
; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12
; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13
-; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13
; DAGISEL12-NEXT: ; %bb.2: ; %tail
; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 076cf09678b57..db557ff23c085 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -801,13 +801,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
@@ -834,14 +833,13 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-SDAG-NEXT: s_endpgm
@@ -858,7 +856,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 %
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-GISEL-NEXT: s_endpgm
@@ -979,13 +976,12 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
@@ -1012,14 +1008,13 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-SDAG-NEXT: s_endpgm
@@ -1036,7 +1031,6 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-GISEL-NEXT: s_endpgm
@@ -1173,7 +1167,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 %
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
@@ -1323,7 +1317,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
@@ -1476,7 +1470,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 %
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
@@ -1664,7 +1658,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
@@ -3366,13 +3360,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
@@ -3399,14 +3392,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-SDAG-NEXT: s_endpgm
@@ -3423,7 +3415,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-GISEL-NEXT: s_endpgm
@@ -3471,13 +3462,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
@@ -3504,14 +3494,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-SDAG-NEXT: s_endpgm
@@ -3528,7 +3517,6 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-GISEL-NEXT: s_endpgm
@@ -3738,7 +3726,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
@@ -3816,7 +3804,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
@@ -4041,7 +4029,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
@@ -4123,7 +4111,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa
; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
@@ -8571,7 +8559,6 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -8610,7 +8597,6 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -8649,7 +8635,6 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1,
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -8688,7 +8673,6 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -8727,7 +8711,6 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -8766,7 +8749,6 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -8805,7 +8787,6 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -8844,7 +8825,6 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s
; GFX12-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -8906,7 +8886,6 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -8922,7 +8901,6 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -8985,7 +8963,6 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -9001,7 +8978,6 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -9090,7 +9066,6 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
@@ -9113,7 +9088,6 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
@@ -9209,7 +9183,6 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
@@ -9232,7 +9205,6 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
@@ -9310,7 +9282,6 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
@@ -9328,7 +9299,6 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
@@ -9401,7 +9371,6 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
@@ -9419,7 +9388,6 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6
; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7
; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
index 10c000095fe3d..8eab7e2fc62fa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
@@ -35,7 +35,6 @@ define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %s
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -77,7 +76,6 @@ define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -130,7 +128,6 @@ define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
@@ -189,7 +186,6 @@ define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1
@@ -235,7 +231,6 @@ define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -274,7 +269,6 @@ define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -317,7 +311,6 @@ define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %sr
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
@@ -362,7 +355,6 @@ define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
@@ -403,7 +395,6 @@ define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -442,7 +433,6 @@ define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -485,7 +475,6 @@ define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %sr
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
@@ -530,7 +519,6 @@ define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
@@ -571,7 +559,6 @@ define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -610,7 +597,6 @@ define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -653,7 +639,6 @@ define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %sr
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
@@ -698,7 +683,6 @@ define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
index 8b6ba1a3cc094..6c032ed061544 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -80,7 +80,6 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__
; GFX12-NEXT: v_readfirstlane_b32 s3, v6
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index ea8703df080d9..930aa6eeb62cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -123,7 +123,6 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
index 148a5ba75d98b..2c9f9a6ca4d55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -49,7 +49,6 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr
; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
; GFX1200-NEXT: s_wait_alu 0xf1ff
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -89,7 +88,6 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp
; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
; GFX1200-NEXT: s_wait_alu 0xf1ff
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index 746b8791c39f9..a86ad8ede2f2c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -273,7 +273,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
; GFX1200-NEXT: s_wait_alu 0xf1ff
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -411,7 +410,6 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
; GFX1200-NEXT: s_wait_alu 0xf1ff
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 71c63bfd69734..a3bdcbe17cc76 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -218,7 +218,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
; GFX1200-NEXT: s_wait_alu 0xf1ff
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -327,7 +326,6 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
; GFX1200-NEXT: s_wait_alu 0xf1ff
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
index e3889ab8f5a21..db6e0ad670747 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -456,7 +456,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX12-NEXT: v_readfirstlane_b32 s6, v3
; GFX12-NEXT: v_readfirstlane_b32 s7, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
@@ -611,7 +610,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: v_readfirstlane_b32 s3, v7
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
index f001bf97fcd9e..eef6bb7b0788f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -456,7 +456,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX12-NEXT: v_readfirstlane_b32 s6, v3
; GFX12-NEXT: v_readfirstlane_b32 s7, v4
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
@@ -611,7 +610,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: v_readfirstlane_b32 s3, v7
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 7342c366799e9..0baec5383160d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -936,7 +936,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v0
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX1100-SDAG-NEXT: s_mov_b32 s2, 0x40280000
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v1, s2, s3
; GFX1100-SDAG-NEXT: v_writelane_b32 v0, 0, s3
; GFX1100-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -1963,7 +1963,6 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1999,7 +1998,6 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2041,7 +2039,6 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1)
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2077,7 +2074,6 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1)
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2119,7 +2115,6 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2155,7 +2150,6 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2197,7 +2191,6 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2233,7 +2226,6 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2275,7 +2267,6 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2311,7 +2302,6 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off
; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2358,7 +2348,6 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1
; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1
; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off
@@ -2400,7 +2389,6 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4
; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3
; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1
; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1
; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off
@@ -2483,7 +2471,6 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v7
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v6
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s0, s1
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s4, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
index edc1afe410a63..90f5369ab98ca 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll
@@ -40,7 +40,6 @@ define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1
; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1
; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off
@@ -118,7 +117,6 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v7
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v6
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1
@@ -168,7 +166,6 @@ define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -219,10 +216,8 @@ define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -264,7 +259,6 @@ define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -315,10 +309,8 @@ define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -360,7 +352,6 @@ define void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1
; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -411,10 +402,8 @@ define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3
; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2
; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1
; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1
; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off
; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index e828a12442fb8..76ca99059d58d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -936,13 +936,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-NEXT: s_lshr_b32 s2, s1, 16
; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 9a2ef15737308..c472ee39a41e4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -749,13 +749,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-NEXT: s_lshr_b32 s2, s1, 16
; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 8f7456b788f81..922d2e86ea6b2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -110,18 +110,17 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v1, v5
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add3_u32 v1, v1, v5, v7
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
@@ -288,22 +287,21 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0
; GFX12-NEXT: v_mad_co_i64_i32 v[10:11], null, v5, v3, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v1, v6
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
@@ -315,13 +313,12 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
index b88266981a253..006da0cd18867 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
@@ -33,12 +33,11 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_or3_b32 v2, v2, s1, s2
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index af4fb0c4d6f6e..a9240eff8e691 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -4904,11 +4904,12 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10002
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 50b6ad9f0cb37..03567c8dcbbc4 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7096,8 +7096,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -7211,7 +7211,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB28_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s7, exec_lo
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
@@ -7246,10 +7246,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB28_5
@@ -7959,8 +7958,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -8069,7 +8068,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX11-NEXT: .LBB29_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s7, exec_lo
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
@@ -8103,10 +8102,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB29_5
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 681c07db327dc..a8f4f636949d8 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -6361,7 +6361,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
@@ -6726,7 +6725,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index bf56496e98690..9bd831fc2c130 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -6361,7 +6361,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
@@ -6726,7 +6725,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index fffdc16e1a501..1201f96739af5 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -7157,7 +7157,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
@@ -7522,7 +7521,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index d4f75051b04d4..34f17bdde2864 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -382,7 +382,7 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v13, v14, v[8:9]
; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
; GFX12-NEXT: s_wait_alu 0xfffd
@@ -1158,7 +1158,7 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
@@ -1249,11 +1249,11 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX12-NEXT: s_wait_alu 0xfffd
@@ -1798,11 +1798,11 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a1197aeace86f..84f5be5fd6d36 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -878,11 +878,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v7, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -892,27 +892,27 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v2, s0, v19, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v17, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v18, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v15, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v21, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v22, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v23, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v25, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v27, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v28, v3, vcc_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index aaac4212e003b..b05a62b3fcd37 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -217,7 +217,6 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000
; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2
; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
@@ -251,7 +250,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1
>From 9ed220f122ceb70229d6d5f23954d11d3f3567d1 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 21 Feb 2025 11:46:33 +0100
Subject: [PATCH 4/7] refactor
---
llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index aa1e72c8bfe70..f0e0d6566e844 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -366,8 +366,10 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
DelayType Type = getDelayType(MI.getDesc().TSFlags);
if (SII->isSALU(MI.getOpcode())) {
- if (State.find(lastSGPRfromVALU) != State.end()){
- State.advanceByNum(SALU, State[lastSGPRfromVALU].VALUCycles, State[lastSGPRfromVALU].VALUNum);
+ auto It = State.find(lastSGPRfromVALU);
+ if (It != State.end()) {
+ DelayInfo Info = It->getSecond();
+ State.advanceByNum(VALU, Info.VALUCycles, Info.VALUNum);
lastSGPRfromVALU = 0;
}
}
>From ce7a983e5d86f68ae6d692d05af791f5e81333a0 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 21 Feb 2025 12:03:41 +0100
Subject: [PATCH 5/7] minor change
---
llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index f0e0d6566e844..ec491435a228d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -400,8 +400,9 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
if (SII->isVALU(MI.getOpcode())) {
for (const auto &Op : MI.defs()) {
- for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
- if (AMDGPU::isSGPR(Op.getReg(), TRI)) {
+ Register Reg = Op.getReg();
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ if (AMDGPU::isSGPR(Reg, TRI)) {
lastSGPRfromVALU = Unit;
break;
}
>From d76f4a1c5d81740c4b7264a85db4041b5f9d5872 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 21 Feb 2025 12:59:05 +0100
Subject: [PATCH 6/7] added function that tests if an instruction waits for
VA_SDST=0
---
.../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index ec491435a228d..6539002ff4aff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -14,9 +14,13 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCRegister.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -57,6 +61,19 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
return false;
}
+ static bool instructionWaitsForSALUWrites(const MachineInstr &MI) {
+ // These instruction types wait for VA_SDST==0 before issuing.
+ // S_CBRANCH_EXECZ and S_CBRANCH_VCCZ are covered by SALU flag
+ const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::EXP |
+ SIInstrFlags::DS | SIInstrFlags::SMRD |
+ SIInstrFlags::MIMG | SIInstrFlags::VIMAGE |
+ SIInstrFlags::VSAMPLE;
+
+ if (MI.getDesc().TSFlags & VA_SDST_0)
+ return true;
+ return false;
+ }
+
// Types of delay that can be encoded in an s_delay_alu instruction.
enum DelayType { VALU, TRANS, SALU, OTHER };
@@ -365,7 +382,7 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
DelayType Type = getDelayType(MI.getDesc().TSFlags);
- if (SII->isSALU(MI.getOpcode())) {
+ if (instructionWaitsForSALUWrites(MI)) {
auto It = State.find(lastSGPRfromVALU);
if (It != State.end()) {
DelayInfo Info = It->getSecond();
>From b5b1c9e32f3485fae779a655a280f0ac870db680 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 21 Feb 2025 16:45:41 +0100
Subject: [PATCH 7/7] eliminated double VALUNum decrement in advanceByNum
function
---
.../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 15 ++-
.../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 2 -
.../CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 96 +++++++++----------
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 18 ++--
.../atomic_optimizations_global_pointer.ll | 26 ++---
.../atomic_optimizations_local_pointer.ll | 13 +--
.../test/CodeGen/AMDGPU/carryout-selection.ll | 4 +-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 8 --
.../AMDGPU/global_atomics_scan_fadd.ll | 10 +-
.../AMDGPU/global_atomics_scan_fsub.ll | 10 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 24 +++--
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 6 +-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 4 +-
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 8 +-
.../AMDGPU/promote-constOffset-to-imm.ll | 12 +--
.../AMDGPU/pseudo-scalar-transcendental.ll | 1 +
16 files changed, 110 insertions(+), 147 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 6539002ff4aff..33512659c47f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -61,16 +61,17 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
return false;
}
- static bool instructionWaitsForSALUWrites(const MachineInstr &MI) {
+ static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
// These instruction types wait for VA_SDST==0 before issuing.
// S_CBRANCH_EXECZ and S_CBRANCH_VCCZ are covered by SALU flag
const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::EXP |
SIInstrFlags::DS | SIInstrFlags::SMRD |
SIInstrFlags::MIMG | SIInstrFlags::VIMAGE |
SIInstrFlags::VSAMPLE;
-
+
if (MI.getDesc().TSFlags & VA_SDST_0)
return true;
+
return false;
}
@@ -254,12 +255,15 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
}
}
- void advanceByNum(DelayType Type, unsigned Cycles, unsigned VALUNum) {
+ void advanceByNum(DelayType Type, unsigned Cycles, unsigned SGPRWriteVALUNum) {
iterator Next;
for (auto I = begin(), E = end(); I != E; I = Next) {
Next = std::next(I);
- if (I->second.VALUNum >= VALUNum && I->second.advance(Type, Cycles))
+ if (I->second.VALUNum >= SGPRWriteVALUNum && I->second.VALUCycles > 0){
erase(I);
+
+
+ }
}
}
@@ -382,7 +386,7 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
DelayType Type = getDelayType(MI.getDesc().TSFlags);
- if (instructionWaitsForSALUWrites(MI)) {
+ if (instructionWaitsForSGPRWrites(MI)) {
auto It = State.find(lastSGPRfromVALU);
if (It != State.end()) {
DelayInfo Info = It->getSecond();
@@ -390,6 +394,7 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
lastSGPRfromVALU = 0;
}
}
+
if (instructionWaitsForVALU(MI)) {
// Forget about all outstanding VALU delays.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index c59f56d18d178..a75b5773b1cc1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -2854,7 +2854,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: flat_store_b32 v[0:1], v3
; GFX12-NEXT: s_endpgm
@@ -3842,7 +3841,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 00fd331162bdc..be894f2c76f67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -361,21 +361,21 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -385,21 +385,21 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv float %a, %b
@@ -2766,21 +2766,21 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2790,21 +2790,21 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
@@ -3981,21 +3981,21 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4005,21 +4005,21 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_constrhs0_dynamic:
@@ -4359,21 +4359,21 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400
; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4383,21 +4383,21 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400
; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_constlhs0_dynamic:
@@ -4732,21 +4732,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4756,21 +4756,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
@@ -5121,21 +5121,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-IEEE-NEXT: s_denorm_mode 15
; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5145,21 +5145,21 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-FLUSH-NEXT: s_denorm_mode 3
; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 6111e9a460e6c..bce06124f6db0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1072,14 +1072,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v2, v11
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
; GFX12-NEXT: s_wait_alu 0xf1fd
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -2435,12 +2435,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
@@ -2449,31 +2448,29 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mov_b32_e32 v20, v22
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mov_b32_e32 v19, v22
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
; GFX12-NEXT: v_mov_b32_e32 v20, v18
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11
; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12
@@ -2515,10 +2512,9 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
; GFX12-NEXT: s_wait_alu 0xf1fd
; GFX12-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 3701a96587757..f2b5a39d88c90 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2145,12 +2145,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -2189,12 +2188,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
@@ -2232,7 +2230,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
@@ -2272,7 +2270,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
@@ -3244,7 +3242,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc
@@ -3329,7 +3327,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
@@ -4068,7 +4066,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4105,7 +4102,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: v_mul_lo_u32 v0, s4, v0
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -4144,7 +4140,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4182,7 +4177,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_mul_lo_u32 v0, s4, v0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -5716,7 +5710,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v5, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -5761,7 +5754,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v5, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -5805,7 +5797,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -5848,7 +5839,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -6818,7 +6808,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc
@@ -6903,7 +6893,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s3, v9
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10
; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 7c008a54e8e94..6182fb51f8d16 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1905,7 +1905,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
@@ -1942,7 +1941,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
@@ -2739,7 +2737,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s3, v9
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s4, v10, vcc
@@ -2817,7 +2815,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v10
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo
@@ -3815,7 +3813,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -5341,7 +5338,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v5, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -5380,7 +5376,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v5, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
@@ -6175,7 +6170,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s3, v9
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s4, v10, vcc
@@ -6253,7 +6248,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v6
; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1132_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v10
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index f975c830aa2aa..aabcd69c88ca3 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2782,7 +2782,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s7, s5, s0
; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1
@@ -2807,7 +2806,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s6, s10, s0
; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0
@@ -2829,7 +2827,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s0, s0, s7
; GFX11-NEXT: v_sub_co_u32 v0, s7, s10, s12
; GFX11-NEXT: s_mul_i32 s6, s3, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s0, s6
; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2
; GFX11-NEXT: s_sub_i32 s6, s11, s0
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 08c6dcc3b0ea9..07c9521e7646a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -15305,7 +15305,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15467,7 +15466,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15641,7 +15639,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15809,7 +15806,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15966,7 +15962,6 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16122,7 +16117,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -16289,7 +16283,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16451,7 +16444,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index e2ca887389b1e..444decac9aaf7 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -7216,7 +7216,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -8876,7 +8876,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -10309,7 +10309,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -11224,7 +11224,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -13272,7 +13272,7 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index fa0689b45257a..10849e248288a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -7544,7 +7544,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -9203,7 +9203,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -10636,7 +10636,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -11551,7 +11551,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
@@ -13598,7 +13598,7 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index a6be79135ec5d..dcb1d0e8c20a1 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -5125,30 +5125,30 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v5, v2
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v4, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v2, 0
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add3_u32 v4, v4, v6, v2
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v3, v1
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v5, v4, v0
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v0, v[3:4]
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v5, v1, v2
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v0, v4
-; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v4, v1, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1]
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -5163,19 +5163,18 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1]
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v7, v4, v2
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1]
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v4, v2
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v0, v5, vcc_lo
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v3, v[1:2]
; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
@@ -5184,16 +5183,15 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v4
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[3:4]
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v5, v7, v4
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v6, v[0:1]
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v5, v8
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v4, v[2:3]
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v5, v3, v[0:1]
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v5, v8
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v8, v[2:3]
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 922d2e86ea6b2..54ab293689bb1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -116,11 +116,10 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
@@ -297,11 +296,10 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 03567c8dcbbc4..240e91e02fec7 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7211,7 +7211,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB28_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_mov_b32 s7, exec_lo
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
@@ -8068,7 +8068,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX11-NEXT: .LBB29_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_mov_b32 s7, exec_lo
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 34f17bdde2864..263dc051737a5 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -382,7 +382,6 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v13, v14, v[8:9]
; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
; GFX12-NEXT: s_wait_alu 0xfffd
@@ -1158,7 +1157,7 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
@@ -1249,11 +1248,10 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX12-NEXT: s_wait_alu 0xfffd
@@ -1798,11 +1796,9 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 84f5be5fd6d36..7223afd99d536 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -878,11 +878,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v7, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -892,27 +891,24 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v2, s0, v19, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v17, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v18, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v15, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v21, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v22, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v23, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v25, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v27, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v28, v3, vcc_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index b05a62b3fcd37..952137e2ff870 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -217,6 +217,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000
; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2
; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
More information about the llvm-commits
mailing list