[llvm] [AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU (PR #127212)
Ana Mihajlovic via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 10 07:41:17 PDT 2025
https://github.com/mihajlovicana updated https://github.com/llvm/llvm-project/pull/127212
>From 6e5f8ce90ee538b814e0776c22398d1f2d5bba96 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Sat, 15 Feb 2025 02:46:58 +0100
Subject: [PATCH 1/3] [AMDGPU] Remove s_delay_alu for VALU->SGPR->SALU
---
.../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 5 +-
.../GlobalISel/llvm.amdgcn.intersect_ray.ll | 8 +--
.../AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll | 1 -
.../AMDGPU/atomic_optimizations_buffer.ll | 12 ----
.../atomic_optimizations_global_pointer.ll | 20 ++----
.../atomic_optimizations_local_pointer.ll | 70 ++++++-------------
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 8 ---
.../atomic_optimizations_struct_buffer.ll | 8 ---
llvm/test/CodeGen/AMDGPU/bf16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 1 -
.../buffer-fat-pointer-atomicrmw-fadd.ll | 32 +++------
.../buffer-fat-pointer-atomicrmw-fmax.ll | 34 +++------
.../buffer-fat-pointer-atomicrmw-fmin.ll | 34 +++------
.../test/CodeGen/AMDGPU/carryout-selection.ll | 13 ++--
.../CodeGen/AMDGPU/combine-add-zext-xor.ll | 16 ++---
.../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 39 +----------
.../expand-scalar-carry-out-select-user.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 10 ++-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 16 ++---
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 17 ++---
llvm/test/CodeGen/AMDGPU/fp-classify.ll | 8 +--
llvm/test/CodeGen/AMDGPU/fptrunc.ll | 10 ++-
llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 16 ++---
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 12 ++--
llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir | 3 +
.../insert_waitcnt_for_precise_memory.ll | 10 ++-
...e92561-restore-undef-scc-verifier-error.ll | 12 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll | 54 +++++++++-----
...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 1 -
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 12 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 12 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll | 1 -
.../AMDGPU/llvm.amdgcn.s.ttracedata.ll | 19 +++--
....amdgcn.struct.buffer.load.format.v3f16.ll | 4 +-
...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 2 -
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 2 -
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 2 -
...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 7 +-
...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 7 +-
...gcn.struct.ptr.buffer.load.format.v3f16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 12 ++--
llvm/test/CodeGen/AMDGPU/min.ll | 4 --
...uf-legalize-operands-non-ptr-intrinsics.ll | 20 +++---
.../CodeGen/AMDGPU/mubuf-legalize-operands.ll | 20 +++---
.../CodeGen/AMDGPU/no-dup-inst-prefetch.ll | 3 +-
.../AMDGPU/pseudo-scalar-transcendental.ll | 2 +-
llvm/test/CodeGen/AMDGPU/saddo.ll | 9 ++-
llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 2 -
llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll | 5 +-
llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 6 +-
...r-descriptor-waterfall-loop-idom-update.ll | 2 +-
53 files changed, 236 insertions(+), 409 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index b25619b4c5422..262368f71fe35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -362,7 +362,10 @@ class AMDGPUInsertDelayAlu {
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
auto It = State.find(Unit);
if (It != State.end()) {
- Delay.merge(It->second);
+ if (!(SII->isSALU(MI.getOpcode())) ||
+ !AMDGPU::isSGPR(Op.getReg(), TRI) ||
+ It->second.VALUCycles == 0)
+ Delay.merge(It->second);
State.erase(Unit);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index b26ddbdd7a342..12ebf3f6879d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -234,8 +234,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v20, v21, v[15:17], v[5:7], v[8:10]], s[4:7]
; GFX11-NEXT: ; implicit-def: $vgpr18
@@ -360,8 +360,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v16, v17, v[13:15], v[4:6]], s[4:7] a16
; GFX11-NEXT: ; implicit-def: $vgpr18
@@ -476,8 +476,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -604,8 +604,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[17:18], v19, v[14:16], v[20:22]], s[4:7] a16
; GFX11-NEXT: ; implicit-def: $vgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
index af50f56a87226..7d084582273d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
@@ -1468,7 +1468,6 @@ define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; encoding: [0x00,0x05,0x00,0x7e]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0 ; encoding: [0x01,0x10,0x00,0xb9]
; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index cd405fabf002d..4b68f8a4bd194 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -777,7 +777,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -822,7 +821,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -864,7 +862,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -910,7 +907,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -1178,7 +1174,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1226,7 +1221,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1270,7 +1264,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1319,7 +1312,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -2246,7 +2238,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2291,7 +2282,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2334,7 +2324,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2380,7 +2369,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 3737cc414c58f..56f909d78dcc5 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -899,7 +899,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -950,7 +949,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -999,7 +997,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -1049,7 +1046,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
@@ -2576,17 +2572,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2639,7 +2634,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
@@ -4454,7 +4448,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
@@ -4505,7 +4498,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
@@ -4554,7 +4546,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
@@ -4604,7 +4595,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
@@ -6164,17 +6154,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6227,7 +6216,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 3c0646c46efd0..eb5353e928682 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -669,7 +669,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -715,7 +714,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -1215,7 +1213,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -1248,7 +1246,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
@@ -2217,17 +2215,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2275,7 +2272,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
@@ -3019,11 +3015,10 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
@@ -3059,7 +3054,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
@@ -4091,7 +4085,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
@@ -4137,7 +4130,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
@@ -4637,7 +4629,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
@@ -4670,7 +4662,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
@@ -5662,17 +5654,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5720,7 +5711,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
@@ -6508,7 +6498,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
@@ -6554,7 +6543,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
@@ -7873,7 +7861,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
@@ -7919,7 +7906,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
@@ -9237,7 +9223,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
@@ -9283,7 +9268,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
@@ -10601,7 +10585,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
@@ -10647,7 +10630,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
@@ -11516,13 +11498,12 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -11574,13 +11555,12 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -12435,7 +12415,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
@@ -12481,7 +12460,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
@@ -13350,13 +13328,12 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -13408,13 +13385,12 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -14269,7 +14245,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
@@ -14315,7 +14290,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
@@ -15173,14 +15147,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -15230,14 +15204,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s8, s[0:1], s[6:7]
; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -16088,7 +16062,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
@@ -16134,7 +16107,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
@@ -16993,14 +16965,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -17050,14 +17022,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[6:7]
; GFX1132_ITERATIVE-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 8c6224cc86284..0a06fe4ea949e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -776,7 +776,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -821,7 +820,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -863,7 +861,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -909,7 +906,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -1834,7 +1830,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
@@ -1879,7 +1874,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
@@ -1922,7 +1916,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
@@ -1968,7 +1961,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 63b46eba41225..bc0bec4772e52 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -797,7 +797,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -843,7 +842,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -885,7 +883,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -932,7 +929,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -2006,7 +2002,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2052,7 +2047,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2095,7 +2089,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2142,7 +2135,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 91598496eb984..774bd9d02fd4c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2375,17 +2375,17 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 2cd15210eb8e5..6f6337dcec514 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -264,7 +264,6 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB2_1
; GFX11-NEXT: ; %bb.3: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index a029164b7acd8..0b3a7dc2e176d 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -386,7 +386,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -444,8 +443,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
@@ -2374,7 +2373,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -2406,7 +2404,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -2474,10 +2471,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -2505,8 +2501,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
@@ -4120,7 +4116,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4156,7 +4151,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4267,8 +4261,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -4300,8 +4294,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
@@ -5564,7 +5558,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -5611,7 +5604,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -5730,8 +5722,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -5774,8 +5766,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -6735,7 +6727,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6794,8 +6785,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -6823,8 +6814,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
@@ -9044,7 +9035,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -9153,8 +9143,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -9201,8 +9191,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 38adf60888eca..96b9964e39dc4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -378,7 +378,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -469,8 +468,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_max_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
@@ -1587,7 +1586,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -1621,7 +1619,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -1689,10 +1686,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -1722,8 +1718,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
@@ -3221,7 +3217,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -3260,7 +3255,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -3373,8 +3367,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -3409,8 +3403,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -4685,7 +4679,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4732,7 +4725,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4851,8 +4843,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -4895,8 +4887,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -6014,7 +6006,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6048,7 +6039,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6146,8 +6136,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -6177,8 +6167,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
@@ -7467,7 +7457,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -7519,7 +7508,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -7634,8 +7622,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -7682,8 +7670,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 2b8cea9068d87..1cf4fa6da1627 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -378,7 +378,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -469,8 +468,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_min_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
@@ -1587,7 +1586,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -1621,7 +1619,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -1689,10 +1686,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: v_readfirstlane_b32 s6, v7
; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -1722,8 +1718,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
@@ -3221,7 +3217,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -3260,7 +3255,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -3373,8 +3367,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -3409,8 +3403,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -4685,7 +4679,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4732,7 +4725,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -4851,8 +4843,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -4895,8 +4887,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
@@ -6014,7 +6006,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6048,7 +6039,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -6146,8 +6136,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -6177,8 +6167,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
@@ -7467,7 +7457,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -7519,7 +7508,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
@@ -7634,8 +7622,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: ; implicit-def: $vgpr4
@@ -7682,8 +7670,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index cdea4fd158b04..03c63063101bb 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2759,7 +2759,6 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_mul_i32 s7, s5, s0
; GFX11-NEXT: s_mul_hi_u32 s13, s5, s1
; GFX11-NEXT: s_mul_i32 s12, s6, s1
@@ -2781,12 +2780,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s1, s1, s7
; GFX11-NEXT: s_addc_u32 s7, 0, s12
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s7, s5, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1
; GFX11-NEXT: s_mul_i32 s6, s6, s1
; GFX11-NEXT: s_add_i32 s7, s12, s7
@@ -2807,9 +2805,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s1, s1, s7
; GFX11-NEXT: s_addc_u32 s5, 0, s5
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s6, s10, s0
; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0
@@ -2881,18 +2879,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_mul_i32 s1, s1, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1
-; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_mul_hi_u32 s0, s10, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s1, s0, s2
; GFX11-NEXT: s_add_i32 s3, s0, 1
; GFX11-NEXT: s_sub_i32 s1, s10, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s4, s1, s2
; GFX11-NEXT: s_cmp_ge_u32 s1, s2
; GFX11-NEXT: s_cselect_b32 s0, s3, s0
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index b42542db6dbd8..aa38bdf6c1b1a 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -38,12 +38,12 @@ define i32 @combine_add_zext_xor() {
; GFX1100-NEXT: s_branch .LBB0_2
; GFX1100-NEXT: .LBB0_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB0_4
; GFX1100-NEXT: .LBB0_2: ; %.a
@@ -118,12 +118,12 @@ define i32 @combine_sub_zext_xor() {
; GFX1100-NEXT: s_branch .LBB1_2
; GFX1100-NEXT: .LBB1_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB1_4
; GFX1100-NEXT: .LBB1_2: ; %.a
@@ -365,11 +365,11 @@ define i32 @combine_add_zext_and() {
; GFX1100-NEXT: s_branch .LBB4_2
; GFX1100-NEXT: .LBB4_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_add_nc_u32_e32 v1, v1, v0
; GFX1100-NEXT: s_cbranch_vccz .LBB4_4
; GFX1100-NEXT: .LBB4_2: ; %.a
@@ -444,11 +444,11 @@ define i32 @combine_sub_zext_and() {
; GFX1100-NEXT: s_branch .LBB5_2
; GFX1100-NEXT: .LBB5_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: s_and_b32 s0, s0, vcc_lo
-; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_sub_nc_u32_e32 v1, v1, v0
; GFX1100-NEXT: s_cbranch_vccz .LBB5_4
; GFX1100-NEXT: .LBB5_2: ; %.a
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d61c4b46596c0..e0a83c7644b14 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -320,7 +320,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB3_1
@@ -348,7 +347,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -441,7 +439,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB4_1
@@ -469,7 +466,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB4_1
@@ -556,7 +552,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB5_1
@@ -584,7 +579,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB5_1
@@ -744,7 +738,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4
; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s2, s2, s5
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2
@@ -761,7 +754,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16
; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc
@@ -799,7 +791,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_2
@@ -962,7 +953,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB7_2
@@ -1011,7 +1001,6 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-GISEL-NEXT: s_bitset0_b32 s0, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s4
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_2
@@ -1132,7 +1121,6 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1
@@ -1162,7 +1150,6 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB8_1
@@ -1266,7 +1253,7 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
@@ -1300,7 +1287,7 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100
; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
@@ -1397,7 +1384,6 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1
@@ -1427,7 +1413,6 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB10_1
@@ -1525,7 +1510,6 @@ define void @test_dynamic_stackalloc_device_divergent() {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB11_1
@@ -1557,7 +1541,6 @@ define void @test_dynamic_stackalloc_device_divergent() {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB11_1
@@ -1670,7 +1653,6 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB12_1
@@ -1705,7 +1687,6 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB12_1
@@ -1805,7 +1786,6 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB13_1
@@ -1837,7 +1817,6 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB13_1
@@ -2048,7 +2027,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2
@@ -2069,7 +2047,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5
; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_4
@@ -2094,7 +2071,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
@@ -2137,7 +2113,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s5, v2, s4
; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_2
@@ -2155,7 +2130,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s6, v1, s5
; GFX11-GISEL-NEXT: s_bitset0_b32 s4, s5
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s3, s3, s6
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_4
@@ -2179,7 +2153,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_7
@@ -2374,7 +2347,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2
@@ -2402,7 +2374,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_6
@@ -2444,7 +2415,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_2
@@ -2473,7 +2443,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3
; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_6
@@ -2585,7 +2554,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB16_1
@@ -2617,7 +2585,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB16_1
@@ -2711,7 +2678,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1
@@ -2741,7 +2707,6 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB17_1
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 598cdddaa53d1..f3aec696abdee 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -66,10 +66,10 @@ define i32 @s_add_co_select_user() {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s1, s0, 0
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
; GFX11-NEXT: s_cselect_b32 s1, s1, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index fec04a27cda91..19deaf4a5535e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1691,27 +1691,25 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_or_b32 s3, s5, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s7, s5, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshr_b32 s7, s5, s6
; GFX11-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lg_u32 s6, s5
; GFX11-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-NEXT: s_or_b32 s5, s7, s5
; GFX11-NEXT: s_lshl_b32 s6, s2, 12
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s6, s3, s6
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cselect_b32 s5, s5, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s6, s5, 7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-NEXT: s_cselect_b32 s7, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 0b6bdedeb48fc..5dd96237685dd 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -15303,9 +15303,9 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15465,9 +15465,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15639,9 +15639,9 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15807,9 +15807,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15964,9 +15964,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16120,9 +16120,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -16287,9 +16287,9 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16449,9 +16449,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index eefcf56586033..cbfae817daade 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -329,10 +329,9 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -344,10 +343,9 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -381,7 +379,7 @@ define i32 @test_D139469_f16(half %arg) {
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
@@ -505,10 +503,9 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -528,10 +525,9 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -578,12 +574,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: s_or_b32 s0, s1, s2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index cc11e256d5544..6a0d52962265d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -316,8 +316,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -368,8 +368,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2
; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -423,8 +423,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2
; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -578,8 +578,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3
; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 4bab6eaab6f7d..d1403b6c1a01d 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -418,26 +418,24 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 5fab0c50bbe57..0c5b8b096d910 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -112,11 +112,10 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s2, s2, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
; GFX11-NEXT: s_mov_b64 s[2:3], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
@@ -272,11 +271,10 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s2, s2, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
; GFX11-NEXT: s_mov_b64 s[2:3], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
@@ -430,11 +428,10 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s5, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s4, s4, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: s_add_i32 s5, s5, s6
@@ -576,11 +573,10 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s3, s3, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_add_i32 s4, s4, s5
@@ -960,16 +956,15 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v2|, |v0|
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: v_mov_b32_e32 v3, s5
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB6_1
@@ -1097,7 +1092,6 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0|
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s5, s5, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index ea3d57d127151..44b1bb25bc057 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -118,14 +118,14 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 1
-; GFX11-NEXT: s_lshr_b32 s0, s0, s30
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshr_b32 s0, s0, s30
; GFX11-NEXT: s_mul_i32 s0, s0, s22
-; GFX11-NEXT: s_mul_i32 s0, s0, s20
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_i32 s0, s0, s20
; GFX11-NEXT: s_or_b32 s0, s19, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1
; GFX11-NEXT: s_mov_b32 s0, s1
; GFX11-NEXT: global_load_u16 v1, v0, s[20:21]
@@ -145,7 +145,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
; GFX11-NEXT: s_and_b32 s1, s8, s1
; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
; GFX11-NEXT: s_cselect_b32 s1, s19, s13
; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
@@ -155,12 +155,12 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
; GFX11-NEXT: v_readfirstlane_b32 s13, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
; GFX11-NEXT: s_cselect_b32 s13, s19, s13
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_bitcmp1_b32 s13, 0
; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s13, s0
; GFX11-NEXT: s_cbranch_vccz .LBB2_6
; GFX11-NEXT: ; %bb.7: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 5c9c0d1119163..9a2b2bebd16fe 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
---
@@ -559,3 +560,5 @@ body: |
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 5256cbcef123a..2f37b45651234 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -555,17 +555,16 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s5, s5, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s5
-; GFX11-NEXT: s_add_i32 s4, s4, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s4, s4, s5
; GFX11-NEXT: s_mul_hi_u32 s4, s2, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s5, s4, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s2, s2, s5
; GFX11-NEXT: s_add_i32 s5, s4, 1
; GFX11-NEXT: s_sub_i32 s6, s2, s3
@@ -590,12 +589,11 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_cvt_u32_f32 s4, s4
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_i32 s5, s5, s4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5
diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
index f961e857f39e5..2053ae970c773 100644
--- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll
@@ -33,14 +33,12 @@ define void @issue92561(ptr addrspace(1) %arg) {
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
; SDAG-NEXT: v_cmp_eq_u64_e64 s1, s[8:9], v[4:5]
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: v_cmp_eq_u64_e64 s2, s[10:11], v[6:7]
; SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: s_and_b32 s0, s0, s1
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; SDAG-NEXT: s_and_b32 s0, s0, s2
-; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG-NEXT: s_and_saveexec_b32 s0, s0
; SDAG-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[4:11], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
@@ -108,14 +106,12 @@ define void @issue92561(ptr addrspace(1) %arg) {
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[6:7]
; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[0:1]
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[2:3]
; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: s_and_b32 s0, s0, s1
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GISEL-NEXT: s_and_b32 s0, s0, s2
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_and_saveexec_b32 s0, s0
; GISEL-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
index 840287b10bb49..3c1d70cfffabf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
@@ -132,25 +132,43 @@ define amdgpu_kernel void @id_arg_i32(i32 %row) #0 {
; Divergent row number just causes a readfirstlane for now.
define amdgpu_kernel void @id_row_i32() #0 {
-; GFX11-LABEL: id_row_i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x63
-; GFX11-NEXT: s_mov_b32 m0, s0
-; GFX11-NEXT: exp pos0 v0, off, off, off done row_en
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: id_row_i32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
+; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
+; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en
+; GFX11-SDAG-NEXT: s_endpgm
;
-; GFX12-LABEL: id_row_i32:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: v_mov_b32_e32 v0, 0x63
-; GFX12-NEXT: s_mov_b32 m0, s0
-; GFX12-NEXT: export pos0 v0, off, off, off done row_en
-; GFX12-NEXT: s_endpgm
+; GFX11-GISEL-LABEL: id_row_i32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x63
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GFX11-GISEL-NEXT: exp pos0 v1, off, off, off done row_en
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: id_row_i32:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT: export pos0 v0, off, off, off done row_en
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: id_row_i32:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x63
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GFX12-GISEL-NEXT: export pos0 v1, off, off, off done row_en
+; GFX12-GISEL-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 99, i32 undef, i32 undef, i32 undef, i1 true, i32 %id)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
index 08d2201036c77..8b6ba1a3cc094 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -83,7 +83,6 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f72f1e52d135f..deeceed3a19be 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -428,7 +428,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -449,7 +448,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -471,7 +469,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -492,7 +489,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -809,7 +805,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_max_u32 s6, s6, s8
@@ -847,7 +843,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, 0
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8
@@ -884,7 +880,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6
@@ -922,7 +918,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index 4551c60770bdf..434e761a5f8a2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -429,7 +429,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6
; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -450,7 +449,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6
; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -472,7 +470,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5
; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -493,7 +490,6 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5
; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
@@ -810,7 +806,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1
; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164DAGISEL-NEXT: s_min_u32 s6, s6, s8
@@ -848,7 +844,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_mov_b32 s6, -1
; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3]
-; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7
; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8
@@ -885,7 +881,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1
; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6
@@ -923,7 +919,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
-; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3
; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3
; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s6
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
index cf86e2e1dedee..f2ee110c28c6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
@@ -13,7 +13,6 @@ define void @test_s_sleep_var1(i32 %arg) {
; GCN-NEXT: s_wait_bvhcnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_sleep_var s0
; GCN-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.sleep.var(i32 %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
index af792851e0ced..b6bf8bb1a19dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
@@ -26,13 +26,18 @@ define amdgpu_cs void @ttracedata_s(i32 inreg %val) {
}
define amdgpu_cs void @ttracedata_v(i32 %val) {
-; GFX11-LABEL: ttracedata_v:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_mov_b32 m0, s0
-; GFX11-NEXT: s_ttracedata
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: ttracedata_v:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
+; GFX11-SDAG-NEXT: s_ttracedata
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: ttracedata_v:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GFX11-GISEL-NEXT: s_ttracedata
+; GFX11-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.s.ttracedata(i32 %val)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index fed7a8ec105fd..ea8703df080d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -99,8 +99,8 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -126,8 +126,8 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
index a2b9c869c9c9a..148a5ba75d98b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -52,7 +52,6 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
@@ -93,7 +92,6 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index c2a0028f4f1f1..746b8791c39f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -276,7 +276,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
@@ -415,7 +414,6 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 104462a506c8c..71c63bfd69734 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -221,7 +221,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
@@ -331,7 +330,6 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
index 13bb72a96142f..e3889ab8f5a21 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -428,8 +428,8 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s1, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_max_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc
@@ -459,7 +459,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
@@ -578,10 +577,9 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
@@ -616,7 +614,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
index e75dd7409d51b..f001bf97fcd9e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -428,8 +428,8 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX11-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s1, s1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_atomic_min_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 glc
@@ -459,7 +459,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
@@ -578,10 +577,9 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
@@ -616,7 +614,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
index 46b2516f72f8e..9018160806925 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
@@ -97,8 +97,8 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 1fc7349882ba1..a9240eff8e691 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -1385,8 +1385,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v0, 0xffff, v0
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003
; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10005
@@ -1690,7 +1690,6 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s6, 0xffff, s2
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10003
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10001
@@ -4904,13 +4903,13 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10002
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -5658,10 +5657,9 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_mov_b32_e32 v7, v1
; GFX12-NEXT: v_mov_b32_e32 v11, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_bfe_u32 v2, v4, 11, 1
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000a
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x1000d
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x1000c
@@ -6004,10 +6002,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: global_load_u16 v0, v32, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s3, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshr_b32 s4, s3, 15
; GFX12-NEXT: s_lshr_b32 s2, s3, 14
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: s_lshr_b32 s6, s3, 12
; GFX12-NEXT: s_lshr_b32 s8, s3, 13
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index a6db7d331cef3..aaf81e2fa4000 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -3476,7 +3476,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
@@ -3584,7 +3583,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
@@ -3692,7 +3690,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
@@ -3800,7 +3797,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, s4
; GFX11-NEXT: s_cselect_b32 s3, s3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index e44803d611f84..8426224d9dd50 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -97,8 +97,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -123,8 +123,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -399,8 +399,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -418,8 +418,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -448,8 +448,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -467,8 +467,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -910,8 +910,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[10:11], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -936,8 +936,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -970,8 +970,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -996,8 +996,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 896cb6042e810..1480743e435ff 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -96,8 +96,8 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -122,8 +122,8 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v5, v4, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -410,8 +410,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -429,8 +429,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -459,8 +459,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v13, v8, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -478,8 +478,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v0, v8, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -945,8 +945,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[10:11], v[2:3]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -971,8 +971,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W32-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
; GFX1100_W32-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX1100_W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W32-NEXT: s_and_saveexec_b32 s0, s0
; GFX1100_W32-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
; GFX1100_W32-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
@@ -1005,8 +1005,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v9, v8, s[8:11], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -1031,8 +1031,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100_W64-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; GFX1100_W64-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX1100_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100_W64-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX1100_W64-NEXT: buffer_load_format_x v9, v0, s[4:7], 0 idxen
; GFX1100_W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index d62f045674ace..f43ca4fdc1762 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -92,10 +92,9 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX12-NEXT: image_sample_lz v1, [v2, v2, v1], s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: v_fma_f32 v1, v1, v0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_cmp_le_f32_e64 s0, 0, v1
; GFX12-NEXT: s_and_b32 s0, s0, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b32 s2, s2, s0
; GFX12-NEXT: s_branch .LBB0_1
; GFX12-NEXT: .LBB0_4: ; %loop0_merge
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index e5e3ba6cdcaf0..b287412c24eb0 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -213,7 +213,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000
; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2
; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index b4eb775008122..a63d9f22236d5 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -106,11 +106,11 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX11-NEXT: s_addc_u32 s7, s3, s5
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0
; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -439,8 +439,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s4, s6, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -560,10 +560,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 94b22b79f6632..0b68a0534fa08 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -452,10 +452,9 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -482,10 +481,9 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 80db5a7f1d794..62d59aafd80ca 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1349,7 +1349,6 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1
; GFX11-LABEL: no_skip_no_successors:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5]
; GFX11-NEXT: s_cbranch_vccz .LBB12_3
; GFX11-NEXT: ; %bb.1: ; %bb6
@@ -1361,7 +1360,6 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1
; GFX11-NEXT: s_mov_b64 exec, 0
; GFX11-NEXT: .LBB12_3: ; %bb3
; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX11-NEXT: ; %bb.4: ; %bb5
; GFX11-NEXT: .LBB12_5:
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 2a2fd93bc2d0b..eb1b844ad8938 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -452,10 +452,9 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -482,10 +481,9 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
index 5b40d53e0a81c..bff5c6c0db365 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
@@ -6,9 +6,9 @@ define amdgpu_kernel void @icmp_test() {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: v_cmp_eq_u16_e64 s[0:1], 0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: ds_store_b32 v1, v0
; CHECK-NEXT: s_endpgm
@@ -27,11 +27,10 @@ define amdgpu_kernel void @fcmp_test(half %x, half %y) {
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s1, s0, 16
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: v_cmp_le_f16_e64 s[0:1], s0, s1
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: ds_store_b32 v1, v0
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index 5360ff2fa402f..a3f632267ccd6 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1625,10 +1625,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX11-NEXT: global_store_b8 v0, v1, s[8:9]
; GFX11-NEXT: s_endpgm
@@ -1648,10 +1647,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX12-NEXT: global_store_b8 v0, v1, s[8:9]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index 0211c5111c31d..e30c8a53b0571 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -57,8 +57,8 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: buffer_store_b32 v0, v0, s[4:7], 0 offen
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
>From 6567e2060a85715b64f98e31b13e06c6f91c950c Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Thu, 20 Feb 2025 13:22:58 +0100
Subject: [PATCH 2/3] added cycle reduction for instructions issued between
VALU->SGPR and SPGR->SALU
---
.../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 49 +++++++++++-
llvm/test/CodeGen/AMDGPU/bf16.ll | 5 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 4 +-
.../CodeGen/AMDGPU/combine-add-zext-xor.ll | 6 +-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 8 --
llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 7 +-
llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir | 74 +++++++++++++++++++
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 3 +-
8 files changed, 129 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 262368f71fe35..ca95805e25dba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -331,6 +331,11 @@ class AMDGPUInsertDelayAlu {
bool Changed = false;
MachineInstr *LastDelayAlu = nullptr;
+ bool VALUSALUStall = false;
+ MCRegUnit lastSgprWrite = 0;
+ MCRegUnit longestWait = 0;
+ unsigned deletedCyclesNum = 0;
+
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
for (auto &MI : MBB.instrs()) {
@@ -362,15 +367,51 @@ class AMDGPUInsertDelayAlu {
for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
auto It = State.find(Unit);
if (It != State.end()) {
- if (!(SII->isSALU(MI.getOpcode())) ||
- !AMDGPU::isSGPR(Op.getReg(), TRI) ||
- It->second.VALUCycles == 0)
+ if (SII->isSALU(MI.getOpcode()) &&
+ AMDGPU::isSGPR(Op.getReg(), TRI) &&
+ It->second.VALUCycles > 0) {
+ deletedCyclesNum = It->second.VALUCycles;
+ State.erase(Unit);
+ VALUSALUStall = true;
+ } else {
Delay.merge(It->second);
- State.erase(Unit);
+ State.erase(Unit);
+ }
}
}
}
}
+ unsigned maxCycles = 0;
+ unsigned lastWrite = 0;
+ if (Type != OTHER) {
+ for (const auto &Op : MI.defs()) {
+ for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
+ if (AMDGPU::isSGPR(Op.getReg(), TRI)) {
+ maxCycles =
+ (State.find(longestWait) == State.end())
+ ? std::max(deletedCyclesNum, (unsigned)0)
+ : std::max(State[longestWait].VALUCycles,
+ State[longestWait].SALUCycles);
+ lastWrite =
+ (State.find(lastSgprWrite) == State.end())
+ ? 0
+ : std::max(State[lastSgprWrite].VALUCycles,
+ State[lastSgprWrite].SALUCycles);
+ if (maxCycles <= lastWrite)
+ longestWait = lastSgprWrite;
+ lastSgprWrite = Unit;
+ }
+ }
+ }
+ }
+
+ if (VALUSALUStall) {
+ State.advance(VALU, maxCycles);
+ VALUSALUStall = false;
+ lastSgprWrite = 0;
+ longestWait = 0;
+ }
+
if (Emit && !MI.isBundledWithPred()) {
// TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
// just ignore them?
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 774bd9d02fd4c..2a0961053dedd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2376,16 +2376,15 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 03c63063101bb..1bf38a4b51718 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2782,7 +2782,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s7, s5, s0
; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1
@@ -2807,7 +2807,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: s_mul_i32 s6, s10, s0
; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index aa38bdf6c1b1a..f5ae5c849df98 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -39,11 +39,10 @@ define i32 @combine_add_zext_xor() {
; GFX1100-NEXT: .LBB0_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB0_4
; GFX1100-NEXT: .LBB0_2: ; %.a
@@ -119,11 +118,10 @@ define i32 @combine_sub_zext_xor() {
; GFX1100-NEXT: .LBB1_1: ; %bb9
; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GFX1100-NEXT: s_xor_b32 s0, s0, -1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_mov_b32_e32 v1, v2
; GFX1100-NEXT: s_cbranch_vccz .LBB1_4
; GFX1100-NEXT: .LBB1_2: ; %.a
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 5dd96237685dd..07c9521e7646a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -15305,7 +15305,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15467,7 +15466,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15641,7 +15639,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -15809,7 +15806,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -15966,7 +15962,6 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16122,7 +16117,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
@@ -16289,7 +16283,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
@@ -16451,7 +16444,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 0c5b8b096d910..55a61e299768d 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -959,12 +959,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: v_mov_b32_e32 v3, s5
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB6_1
@@ -1094,17 +1094,16 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
; GFX11-NEXT: s_cselect_b32 s5, s5, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, s5, v2
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
; GFX11-NEXT: v_mov_b32_e32 v3, s5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB7_1
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
index 9a2b2bebd16fe..86219940ebcd9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -560,5 +560,79 @@ body: |
$vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
$vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
...
+
+# Check if s_delay_alu is added
+---
+name: redundant_delay_alu_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: redundant_delay_alu_1:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr0, $sgpr1, implicit $exec
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr0= S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+...
+
+# Check if s_delay_alu is added
+---
+name: delay_alu
+body: |
+ bb.0:
+ ; CHECK-LABEL: delay_alu:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+...
+
+# Check if reduntant delay_alu is removed
+---
+name: redundant_delay_alu_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: redundant_delay_alu_2:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s6, s7
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr6_sgpr7 = V_CMP_EQ_U32_e64 $sgpr6, $sgpr7, implicit $exec
+ $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+...
+
+# Check if reduntant delay_alu is removed
+---
+name: perserved_delay
+body: |
+ bb.0:
+ ; CHECK-LABEL: perserved_delay:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, s5
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ ; CHECK-NEXT: s_or_b32 s0, s0, s1
+ ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+ ; CHECK-NEXT: s_or_b32 s2, s0, s0
+ ; CHECK-NEXT: v_mul_f32_e64 v0, v0, v0
+ liveins : $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
+ $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr4, $sgpr5, implicit $exec
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ $sgpr0 = S_OR_B32 $sgpr0, $sgpr1, implicit-def $scc
+ $sgpr2 = S_OR_B32 $sgpr0, $sgpr0, implicit-def $scc
+ $vgpr0 = V_MUL_F32_e64 0, $vgpr0, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+...
## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
# CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index a9240eff8e691..af4fb0c4d6f6e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -4904,12 +4904,11 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10002
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10001
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_and_b32 s2, s2, 1
; GFX12-NEXT: s_wait_alu 0xfffe
>From 5441736d13a15a1892715c1a16bd5e7859ed9da7 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Mon, 10 Mar 2025 15:40:25 +0100
Subject: [PATCH 3/3] rebase
---
.../AMDGPU/buffer-fat-pointers-memcpy.ll | 2 --
.../AMDGPU/llvm.amdgcn.readfirstlane.m0.ll | 1 -
.../CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll | 18 ++++++------------
3 files changed, 6 insertions(+), 15 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index 8e023723ec25c..ffa9b465af0dd 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -923,7 +923,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr
; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
; SDAG-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0
-; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
; SDAG-GFX1100-NEXT: s_clause 0xf
; SDAG-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen
@@ -1097,7 +1096,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) inreg %src, ptr
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0
; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0
; GISEL-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1
; GISEL-GFX1100-NEXT: s_clause 0xf
; GISEL-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll
index 0ffee36d520dc..2fba9844fbcc1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.m0.ll
@@ -22,7 +22,6 @@ define void @test_readfirstlane_m0(i32 %arg) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: s_mov_b32 m0, s0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
index b6bf8bb1a19dc..8aa8fac8b7985 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.ttracedata.ll
@@ -26,18 +26,12 @@ define amdgpu_cs void @ttracedata_s(i32 inreg %val) {
}
define amdgpu_cs void @ttracedata_v(i32 %val) {
-; GFX11-SDAG-LABEL: ttracedata_v:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
-; GFX11-SDAG-NEXT: s_ttracedata
-; GFX11-SDAG-NEXT: s_endpgm
-;
-; GFX11-GISEL-LABEL: ttracedata_v:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 m0, v0
-; GFX11-GISEL-NEXT: s_ttracedata
-; GFX11-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: ttracedata_v:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 m0, s0
+; GFX11-NEXT: s_ttracedata
+; GFX11-NEXT: s_endpgm
call void @llvm.amdgcn.s.ttracedata(i32 %val)
ret void
}
More information about the llvm-commits
mailing list