[llvm] [AMDGPU] Unused sdst writing to null (PR #133229)
Ana Mihajlovic via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 27 07:20:57 PDT 2025
https://github.com/mihajlovicana updated https://github.com/llvm/llvm-project/pull/133229
>From fc320644c89ae9961eb58d1cfdddf4203c1c397b Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Thu, 27 Mar 2025 11:33:21 +0100
Subject: [PATCH 1/2] [AMDGPU] Unused sdst writing to null
---
.../Target/AMDGPU/SIShrinkInstructions.cpp | 5 +
.../CodeGen/AMDGPU/GlobalISel/addsubu64.ll | 6 +-
.../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 24 +-
.../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 44 +-
.../GlobalISel/extractelement-stack-lower.ll | 9 +-
.../AMDGPU/GlobalISel/extractelement.i128.ll | 7 +-
.../AMDGPU/GlobalISel/extractelement.i16.ll | 14 +-
.../AMDGPU/GlobalISel/extractelement.i8.ll | 36 +-
.../llvm.amdgcn.global.atomic.csub.ll | 10 +-
.../GlobalISel/llvm.amdgcn.intersect_ray.ll | 195 +++++----
.../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll | 85 ++--
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 66 ++-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 24 +-
.../AMDGPU/GlobalISel/shl-ext-reduce.ll | 2 +-
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 24 +-
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll | 224 +++++-----
.../atomic_optimizations_global_pointer.ll | 176 ++++----
.../atomic_optimizations_local_pointer.ll | 121 +++---
llvm/test/CodeGen/AMDGPU/bf16.ll | 42 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 126 +++---
.../AMDGPU/cgp-addressing-modes-flat.ll | 18 +-
.../CodeGen/AMDGPU/div-rem-by-constant-64.ll | 58 +--
llvm/test/CodeGen/AMDGPU/dpp64_combine.ll | 7 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 162 ++++----
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 162 ++++----
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 162 ++++----
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 172 ++++----
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 96 +++--
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 3 +-
llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll | 9 +-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 10 +-
llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll | 19 +-
.../AMDGPU/gfx12_scalar_subword_loads.ll | 16 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 96 ++---
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 96 ++---
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 96 ++---
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 96 ++---
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 84 ++--
llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 15 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 175 ++++----
.../AMDGPU/llvm.amdgcn.intersect_ray.ll | 20 +-
.../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 12 +-
.../AMDGPU/llvm.amdgcn.s.prefetch.data.ll | 5 +-
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 48 +--
.../AMDGPU/load-constant-always-uniform.ll | 8 +-
llvm/test/CodeGen/AMDGPU/lrint.ll | 31 +-
llvm/test/CodeGen/AMDGPU/lround.ll | 30 +-
...ne-sink-temporal-divergence-swdev407790.ll | 7 +-
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 59 ++-
.../match-perm-extract-vector-elt-bug.ll | 11 +-
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 126 +++---
llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 381 +++++++++---------
llvm/test/CodeGen/AMDGPU/mul.ll | 8 +-
llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 234 +++++------
.../CodeGen/AMDGPU/offset-split-global.ll | 192 ++++-----
.../AMDGPU/promote-constOffset-to-imm.ll | 216 +++++-----
.../AMDGPU/reassoc-mul-add-1-to-mad.ll | 12 +-
llvm/test/CodeGen/AMDGPU/saddo.ll | 5 +-
llvm/test/CodeGen/AMDGPU/saddsat.ll | 2 +-
llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll | 2 +-
.../CodeGen/AMDGPU/spill-scavenge-offset.ll | 96 ++---
llvm/test/CodeGen/AMDGPU/ssubsat.ll | 2 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 17 +-
llvm/test/CodeGen/AMDGPU/uaddsat.ll | 4 +-
llvm/test/CodeGen/AMDGPU/udiv.ll | 8 +-
llvm/test/CodeGen/AMDGPU/usubsat.ll | 28 +-
llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll | 273 +++++++------
.../AMDGPU/vgpr-mark-last-scratch-load.ll | 8 +-
68 files changed, 2384 insertions(+), 2253 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index f03cde455f295..06d5e48cdab73 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -973,6 +973,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
continue;
}
+ if (TII->isVOP3(MI.getOpcode()) &&
+ TII->hasVALU32BitEncoding(MI.getOpcode())) {
+ tryReplaceDeadSDST(MI);
+ }
+
if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
// If there is no chance we will shrink it and use VCC as sdst to get
// a 32 bit form try to replace dead sdst with NULL.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index 6a04dd492fcea..425dd8acd4736 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -39,7 +39,8 @@ define amdgpu_ps void @v_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GCN-LABEL: v_add_u64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
; GCN-NEXT: s_endpgm
entry:
@@ -85,7 +86,8 @@ define amdgpu_ps void @v_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GCN-LABEL: v_sub_u64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4
-; GCN-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index dce4048a4b87e..d9be677a0e58d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -1440,16 +1440,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_store_b32 v[0:1], v3
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1539,8 +1539,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -2090,16 +2090,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2194,8 +2194,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index e2d179a77f76c..92a7de9aaefd2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -2824,16 +2824,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_store_b32 v[0:1], v3
; GFX11-NEXT: s_endpgm
;
@@ -2846,15 +2846,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: flat_store_b32 v[0:1], v3
; GFX12-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2944,8 +2944,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -2963,8 +2963,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 42
; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -3810,16 +3810,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
;
@@ -3833,15 +3833,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3936,8 +3936,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3955,8 +3955,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index a948446aceff1..573017f7a948d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -29,10 +29,9 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 %idx
@@ -63,10 +62,9 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: global_load_u16 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <128 x i16>, ptr addrspace(1) %ptr
%elt = extractelement <128 x i16> %vec, i32 %idx
@@ -97,10 +95,9 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <32 x i64>, ptr addrspace(1) %ptr
%elt = extractelement <32 x i64> %vec, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index e1ce9ea14a2a9..c424738dceb0e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -126,8 +126,8 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -195,7 +195,8 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 021f609053a0f..63c3146ebeedd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -133,8 +133,8 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -199,7 +199,8 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -776,8 +777,8 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr,
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -842,7 +843,8 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index c2394ec461490..e6a02c6ae6919 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -131,9 +131,9 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_ashr_i32 s1, s0, 31
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -195,10 +195,10 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -267,8 +267,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %p
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -784,9 +784,9 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_ashr_i32 s1, s0, 31
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -848,10 +848,10 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -920,8 +920,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %p
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -1821,9 +1821,9 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_ashr_i32 s1, s0, 31
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -1885,10 +1885,10 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1957,8 +1957,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg %
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 0f437114ea430..d0d4f4bedf314 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -38,7 +38,7 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -47,7 +47,8 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -101,7 +102,7 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -110,7 +111,8 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 809a3e0dd8ef5..c862335764dd4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -642,14 +642,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX1030-NEXT: v_mov_b32_e32 v2, s2
; GFX1030-NEXT: v_mov_b32_e32 v3, s3
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1030-NEXT: flat_load_dword v0, v[0:1]
; GFX1030-NEXT: flat_load_dword v1, v[2:3]
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
@@ -705,11 +705,12 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_mov_b32 s1, 1.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: flat_load_b32 v9, v[0:1]
; GFX11-NEXT: flat_load_b32 v10, v[2:3]
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
@@ -753,14 +754,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX1030-NEXT: v_mov_b32_e32 v2, s2
; GFX1030-NEXT: v_mov_b32_e32 v3, s3
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1030-NEXT: flat_load_dword v0, v[0:1]
; GFX1030-NEXT: flat_load_dword v1, v[2:3]
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
@@ -809,11 +810,12 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_mov_b32 s1, 1.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: flat_load_b32 v6, v[0:1]
; GFX11-NEXT: flat_load_b32 v7, v[2:3]
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8
@@ -844,34 +846,63 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
}
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
-; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX10-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX10-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX10-NEXT: v_mov_b32_e32 v7, 4.0
-; GFX10-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX10-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX10-NEXT: v_mov_b32_e32 v10, 0x40e00000
-; GFX10-NEXT: v_mov_b32_e32 v11, 0x41000000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: flat_load_dword v2, v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX10-NEXT: v_bfrev_b32_e32 v1, 4.0
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX10-NEXT: s_endpgm
+; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX1030-NEXT: v_mov_b32_e32 v3, 0
+; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000
+; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
+; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s7
+; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1030-NEXT: flat_load_dword v2, v[0:1]
+; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
+; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0
+; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX1030-NEXT: s_endpgm
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: s_clause 0x1
+; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX1013-NEXT: v_mov_b32_e32 v3, 0
+; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000
+; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
+; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
+; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1013-NEXT: v_mov_b32_e32 v0, s6
+; GFX1013-NEXT: v_mov_b32_e32 v1, s7
+; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1013-NEXT: flat_load_dword v2, v[0:1]
+; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
+; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0
+; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX1013-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0:
@@ -897,9 +928,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
; GFX11-NEXT: v_mov_b32_e32 v1, s7
; GFX11-NEXT: s_mov_b32 s6, 2.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_mov_b32_e32 v2, s6
@@ -926,31 +957,57 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
}
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
-; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX10-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX10-NEXT: v_mov_b32_e32 v6, 0x44004200
-; GFX10-NEXT: v_mov_b32_e32 v7, 0x46004500
-; GFX10-NEXT: v_mov_b32_e32 v8, 0x48004700
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-NEXT: v_mov_b32_e32 v1, s7
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: flat_load_dword v2, v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX10-NEXT: v_bfrev_b32_e32 v1, 4.0
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX10-NEXT: s_endpgm
+; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX1030-NEXT: v_mov_b32_e32 v3, 0
+; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
+; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
+; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s6
+; GFX1030-NEXT: v_mov_b32_e32 v1, s7
+; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1030-NEXT: flat_load_dword v2, v[0:1]
+; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
+; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0
+; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX1030-NEXT: s_endpgm
+;
+; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
+; GFX1013: ; %bb.0:
+; GFX1013-NEXT: s_clause 0x1
+; GFX1013-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX1013-NEXT: v_mov_b32_e32 v3, 0
+; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
+; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
+; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
+; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1013-NEXT: v_mov_b32_e32 v0, s6
+; GFX1013-NEXT: v_mov_b32_e32 v1, s7
+; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1013-NEXT: flat_load_dword v2, v[0:1]
+; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
+; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0
+; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1013-NEXT: s_waitcnt vmcnt(0)
+; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX1013-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0:
@@ -971,9 +1028,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX11-NEXT: s_mov_b32 s6, 2.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_mov_b32_e32 v2, s6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 31526bcfead4e..e88c5e78779b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -233,7 +233,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
; GFX12-LABEL: mubuf_store_vgpr_ptr_offset4294967296:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
@@ -266,7 +267,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
; GFX12-LABEL: mubuf_store_vgpr_ptr_offset4294967297:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 4
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
@@ -377,9 +379,9 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
@@ -415,9 +417,9 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(ptr addrspace(
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:1024
; GFX12-NEXT: s_endpgm
@@ -454,9 +456,9 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(ptr addrspace(
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:1024
; GFX12-NEXT: s_endpgm
@@ -498,8 +500,8 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
@@ -542,8 +544,8 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16380
; GFX12-NEXT: s_endpgm
@@ -586,8 +588,8 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16380
; GFX12-NEXT: s_endpgm
@@ -829,10 +831,10 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4294967296:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 4, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296
%val = load volatile float, ptr addrspace(1) %gep
@@ -863,10 +865,10 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4294967297:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 4
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 4, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297
%val = load volatile float, ptr addrspace(1) %gep
@@ -976,12 +978,11 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset
%val = load volatile float, ptr addrspace(1) %gep
@@ -1015,12 +1016,11 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(ptr addrspace(
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset
%gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 256
@@ -1055,12 +1055,11 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(ptr addrspace(
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 256
%gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %soffset
@@ -1100,11 +1099,10 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset
%val = load volatile float, ptr addrspace(1) %gep
@@ -1145,11 +1143,10 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset
%gep1 = getelementptr float, ptr addrspace(1) %gep0, i64 4095
@@ -1190,11 +1187,10 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 4095
%gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %voffset
@@ -1362,12 +1358,12 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
; GFX12-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 2
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1415,12 +1411,11 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
%result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
@@ -1596,11 +1591,10 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 4, v1, vcc_lo
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
%result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
@@ -1649,12 +1643,11 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v5, v1, vcc_lo
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
%result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index bce06124f6db0..455446aa38c60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1046,8 +1046,8 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v3, v6, s0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4]
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7]
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1075,13 +1075,12 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
-; GFX12-NEXT: s_wait_alu 0xf1fd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
@@ -2365,14 +2364,14 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v27, vcc_lo, 0, v24, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
@@ -2381,7 +2380,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX11-NEXT: v_mov_b32_e32 v21, v22
; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9
; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21]
; GFX11-NEXT: v_mov_b32_e32 v6, v25
; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15
@@ -2392,31 +2391,31 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21]
; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7]
; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13
-; GFX11-NEXT: v_add_co_ci_u32_e64 v8, s2, 0, v8, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2
; GFX11-NEXT: v_mov_b32_e32 v11, v1
; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7]
; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12
; GFX11-NEXT: v_mov_b32_e32 v12, v24
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s2, 0, v8, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2
; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14]
; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12]
; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s4, 0, v10, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4
; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9]
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s5, v23, v25, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v29, s4
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s2, v7, v20, s2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s2, v7, v21, s3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s1, v7, v30, s1
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v28, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v7, v22, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -2442,16 +2441,16 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
@@ -2461,7 +2460,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mov_b32_e32 v20, v22
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
; GFX12-NEXT: v_mov_b32_e32 v19, v22
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
@@ -2481,18 +2480,18 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
; GFX12-NEXT: v_mov_b32_e32 v14, v21
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
+; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
; GFX12-NEXT: s_wait_alu 0xf1ff
@@ -2504,19 +2503,18 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
-; GFX12-NEXT: s_wait_alu 0xf1fd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 5ede348e51f54..723ad5646c0a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4157,7 +4157,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
@@ -4366,7 +4366,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s1, v1, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
@@ -4461,7 +4461,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s1, v1, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
@@ -4538,7 +4538,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
@@ -4709,7 +4709,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX11-LABEL: saddsat_i64_sv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s1, v1, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
@@ -4781,7 +4781,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX11-LABEL: saddsat_i64_vs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s1, v1, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
@@ -4893,9 +4893,9 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v7, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[4:5]
@@ -5434,7 +5434,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s3, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
@@ -5593,7 +5593,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, s3, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
@@ -5833,7 +5833,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v8
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, v3, v11, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
@@ -5845,7 +5845,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v4, v12
; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, v7, v15, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 6928c5a025f18..ead26adbf588f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -303,7 +303,7 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
; GFX11-NEXT: global_store_b32 v[2:3], v1, off
; GFX11-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 9ebf89519d6c9..d6eb4b3477adb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4163,7 +4163,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
@@ -4372,7 +4372,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, s1, v1, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
@@ -4467,7 +4467,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
-; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v3, null, s1, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
@@ -4544,7 +4544,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
@@ -4715,7 +4715,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX11-LABEL: ssubsat_i64_sv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, s1, v1, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
@@ -4787,7 +4787,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX11-LABEL: ssubsat_i64_vs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
-; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v3, null, s1, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
@@ -4899,9 +4899,9 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v9, null, v1, v5, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v11, null, v3, v7, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[4:5]
@@ -5457,7 +5457,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0
; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, null, s3, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
@@ -5625,7 +5625,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
-; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v7, null, s3, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
@@ -5884,7 +5884,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8
; GFX11-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
; GFX11-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v19, null, v3, v11, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
@@ -5898,7 +5898,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12
; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
; GFX11-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v21, null, v7, v15, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 19a09d973098a..c7c9e90e19677 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -1259,9 +1259,9 @@ define amdgpu_kernel void @v4i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x i16>, ptr addrspace(1) %in, align 4
@@ -1352,9 +1352,9 @@ define amdgpu_kernel void @v4f16_to_i64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%load = load <4 x half>, ptr addrspace(1) %in, align 4
@@ -3105,101 +3105,102 @@ define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) {
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v5
; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, 0, v4
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, 0, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v11, null, 0, v5, vcc_lo
; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v1
-; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v0
; GFX11-NEXT: v_mul_lo_u32 v6, v10, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add3_u32 v14, v1, v6, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_hi_u32 v15, v13, v0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v14, 0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v14, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v15, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v13, vcc_lo, v13, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v12, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, v12, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v7, v10, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_hi_u32 v11, v13, v0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v10, v1, v7, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v10, 0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v11, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v13, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v12, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, v12, v1, vcc_lo
; GFX11-NEXT: v_mul_hi_u32 v11, s4, v8
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s5, v8, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s4, v10, 0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s5, v10, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v11, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v1, v7, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, v8
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX11-NEXT: v_mul_lo_u32 v8, v5, v6
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v6, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v9, v4, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_nc_u32_e32 v8, s5, v1
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, s0, v8, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, v7, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v7, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sub_nc_u32_e32 v8, s5, v1
; GFX11-NEXT: v_sub_co_u32 v11, s0, v0, v4
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
-; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, v1, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, null, v8, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v1, v5
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, null, 0, v8, s0
; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v1, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v5
; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v4
@@ -3207,15 +3208,16 @@ define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) {
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v5
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0
; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1
-; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v1 :: v_dual_cndmask_b32 v0, v6, v4
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: .LBB28_2: ; %Flow1
@@ -3263,101 +3265,102 @@ define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) {
; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v2
; GFX11-NEXT: v_cvt_f32_u32_e32 v5, v3
; GFX11-NEXT: v_sub_co_u32 v11, vcc_lo, 0, v2
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v12, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v12, null, 0, v3, vcc_lo
; GFX11-NEXT: v_fmamk_f32 v4, v5, 0x4f800000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f32_e32 v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_fmamk_f32 v4, v5, 0xcf800000, v4
; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v5
-; GFX11-NEXT: v_cvt_u32_f32_e32 v14, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_u32_f32_e32 v14, v4
; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v7, v12, v14
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add3_u32 v15, v5, v6, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_hi_u32 v16, v14, v4
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v15, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v15, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v13, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, v13, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_lo_u32 v6, v12, v14
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_hi_u32 v12, v14, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add3_u32 v11, v5, v7, v6
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v11, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v11, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v12, v5
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v13, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, v13, v5, vcc_lo
; GFX11-NEXT: v_mul_hi_u32 v11, s6, v8
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s7, v8, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, s6, v10, 0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s7, v10, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v11, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v7, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v4, v8
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo
; GFX11-NEXT: v_mul_lo_u32 v8, v3, v6
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v2, v6, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v9, v2, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v5, v5, v9, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_nc_u32_e32 v8, s7, v5
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, s0, v8, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, v7, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v7, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sub_nc_u32_e32 v8, s7, v5
; GFX11-NEXT: v_sub_co_u32 v11, s0, v4, v2
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s7, v5, vcc_lo
-; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, s7, v5, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, null, v8, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v5, v3
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, null, 0, v8, s0
; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v3
; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2
@@ -3365,15 +3368,16 @@ define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> %arg) {
; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v3
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v2, s0
; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1
-; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v3, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v3 :: v_dual_cndmask_b32 v4, v6, v4
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: .LBB28_6: ; %Flow
@@ -19752,21 +19756,25 @@ define <32 x half> @v_bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
; GFX11-NEXT: s_cbranch_execz .LBB127_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v13, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: .LBB127_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -19951,21 +19959,25 @@ define <32 x i16> @v_bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
; GFX11-NEXT: s_cbranch_execz .LBB128_2
; GFX11-NEXT: ; %bb.1: ; %cmp.true
; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v15, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v13, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: .LBB128_2: ; %end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 1a0c15e2b28ec..62083b3e67ab6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2600,9 +2600,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0
-; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -2654,9 +2654,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
-; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -2709,9 +2709,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0
-; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1264_ITERATIVE-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -2761,9 +2761,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
-; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1232_ITERATIVE-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -3156,33 +3156,33 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -3192,7 +3192,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v7, null, s3, v9, vcc
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], 0
; GFX1164_DPP-NEXT: s_endpgm
@@ -3263,26 +3263,26 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -3291,7 +3291,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v2, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15
@@ -3330,7 +3330,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v9, null, s3, v11, vcc_lo
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
; GFX1132_DPP-NEXT: s_endpgm
@@ -3347,20 +3347,20 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -3368,7 +3368,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3376,7 +3376,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -3387,7 +3387,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3442,7 +3442,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v7, vcc, s3, v9, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v7, null, s3, v9, vcc
; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null
; GFX1264_DPP-NEXT: s_endpgm
@@ -3460,20 +3460,20 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -3481,7 +3481,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -3490,7 +3490,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1232_DPP-NEXT: v_readlane_b32 s4, v2, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s7, v2, 15
@@ -3530,7 +3530,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s2, v10
; GFX1232_DPP-NEXT: s_mov_b32 s2, s6
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v9, null, s3, v11, vcc_lo
; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null
; GFX1232_DPP-NEXT: s_endpgm
@@ -5322,9 +5322,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -5360,9 +5360,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1132-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -5401,9 +5401,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
; GFX1264-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1264-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -5440,9 +5440,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1232-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -5710,7 +5710,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v5, vcc
+; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v5, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -5754,7 +5754,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[4:5]
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v5, vcc_lo
+; GFX1132-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v5, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
@@ -5797,7 +5797,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v4, vcc
+; GFX1264-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v4, vcc
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX1264-NEXT: s_endpgm
@@ -5839,7 +5839,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5]
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v4, vcc_lo
+; GFX1232-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v4, vcc_lo
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX1232-NEXT: s_endpgm
@@ -6166,9 +6166,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -6220,9 +6220,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -6275,9 +6275,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1264_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1264_ITERATIVE-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -6327,9 +6327,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1232_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1232_ITERATIVE-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1232_ITERATIVE-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -6722,33 +6722,33 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -6758,7 +6758,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6811,7 +6811,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
-; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc
+; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e64 v7, null, s3, v9, vcc
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], 0
; GFX1164_DPP-NEXT: s_endpgm
@@ -6829,26 +6829,26 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -6857,7 +6857,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v2, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v2, 15
@@ -6896,7 +6896,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10
; GFX1132_DPP-NEXT: s_mov_b32 s2, s6
-; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e64 v9, null, s3, v11, vcc_lo
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
; GFX1132_DPP-NEXT: s_endpgm
@@ -6913,20 +6913,20 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -6934,7 +6934,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6942,7 +6942,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1264_DPP-NEXT: v_readlane_b32 s2, v1, 31
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -6953,7 +6953,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7008,7 +7008,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e32 v7, vcc, s3, v9, vcc
+; GFX1264_DPP-NEXT: v_sub_co_ci_u32_e64 v7, null, s3, v9, vcc
; GFX1264_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264_DPP-NEXT: buffer_store_b64 v[6:7], off, s[0:3], null
; GFX1264_DPP-NEXT: s_endpgm
@@ -7026,20 +7026,20 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -7047,7 +7047,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -7056,7 +7056,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1232_DPP-NEXT: v_readlane_b32 s4, v2, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s7, v2, 15
@@ -7096,7 +7096,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s2, v10
; GFX1232_DPP-NEXT: s_mov_b32 s2, s6
; GFX1232_DPP-NEXT: s_wait_alu 0xfffd
-; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s3, v11, vcc_lo
+; GFX1232_DPP-NEXT: v_sub_co_ci_u32_e64 v9, null, s3, v11, vcc_lo
; GFX1232_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], null
; GFX1232_DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 73cfdac8281a2..9775a37276dfd 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2239,9 +2239,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_add_co_u32 v0, vcc, s2, v0
-; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -2288,9 +2288,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
-; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -2656,42 +2656,42 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2740,7 +2740,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32 v7, vcc, s3, v9
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v8, vcc, s4, v10, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v8, null, s4, v10, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1164_DPP-NEXT: s_endpgm
@@ -2758,26 +2758,26 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -2786,7 +2786,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2818,7 +2818,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s3, v10
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v9, null, s4, v11, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
; GFX1132_DPP-NEXT: s_endpgm
@@ -3311,38 +3311,38 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0
; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v4, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
@@ -3371,31 +3371,30 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v2, v3
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v2
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v3
@@ -5058,9 +5057,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -5092,9 +5091,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1132-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -5338,7 +5337,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v5, vcc
+; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s4, v5, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
@@ -5376,7 +5375,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v5, vcc_lo
+; GFX1132-NEXT: v_sub_co_ci_u32_e64 v1, null, s4, v5, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
entry:
@@ -5672,9 +5671,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -5721,9 +5720,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -6089,42 +6088,42 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6173,7 +6172,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_sub_co_u32 v7, vcc, s3, v9
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v8, vcc, s4, v10, vcc
+; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e64 v8, null, s4, v10, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1164_DPP-NEXT: s_endpgm
@@ -6191,26 +6190,26 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc_lo, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -6219,7 +6218,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6251,7 +6250,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s3, v10
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s4, v11, vcc_lo
+; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e64 v9, null, s4, v11, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0
; GFX1132_DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 2ef88010bd157..c8d46c7feb91d 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -32016,8 +32016,8 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi bfloat %x to i64
ret i64 %op
@@ -32210,11 +32210,12 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-NEXT: v_xor_b32_e32 v4, v4, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v1
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v1, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <2 x bfloat> %x to <2 x i64>
ret <2 x i64> %op
@@ -32473,19 +32474,21 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) {
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT: v_xor_b32_e32 v3, v3, v5
; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX11-NEXT: v_xor_b32_e32 v3, v3, v5
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v5
; GFX11-NEXT: v_xor_b32_e32 v9, v0, v7
; GFX11-NEXT: v_xor_b32_e32 v4, v4, v7
; GFX11-NEXT: v_xor_b32_e32 v10, v1, v8
; GFX11-NEXT: v_xor_b32_e32 v6, v6, v8
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v3, v5, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v9, v7
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v4, v7, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v10, v8
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v6, v8, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i64>
ret <3 x i64> %op
@@ -32791,14 +32794,14 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v2|
; GFX11-NEXT: v_mul_f32_e64 v6, 0x2f800000, |v0|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mul_f32_e64 v8, 0x2f800000, |v3|
-; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GFX11-NEXT: v_mul_f32_e64 v9, 0x2f800000, |v4|
+; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; GFX11-NEXT: v_floor_f32_e32 v1, v1
; GFX11-NEXT: v_floor_f32_e32 v6, v6
-; GFX11-NEXT: v_floor_f32_e32 v8, v8
; GFX11-NEXT: v_ashrrev_i32_e32 v7, 31, v0
+; GFX11-NEXT: v_floor_f32_e32 v8, v8
; GFX11-NEXT: v_floor_f32_e32 v9, v9
; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v2|
; GFX11-NEXT: v_fma_f32 v0, 0xcf800000, v6, |v0|
@@ -32816,21 +32819,22 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8
; GFX11-NEXT: v_xor_b32_e32 v6, v6, v7
; GFX11-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
; GFX11-NEXT: v_ashrrev_i32_e32 v13, 31, v4
+; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v5
; GFX11-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-NEXT: v_xor_b32_e32 v4, v12, v10
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v3, v7
+; GFX11-NEXT: v_xor_b32_e32 v4, v12, v10
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v6, v7, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v5, v8, v10
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v6, v7, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v6, v11, v13
; GFX11-NEXT: v_xor_b32_e32 v7, v9, v13
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v10
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v10, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v5, v10, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v6, vcc_lo, v6, v13
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, null, v7, v13, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i64>
ret <4 x i64> %op
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index dac78a727d72c..bf8b173c53e1c 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -2380,14 +2380,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: v_mul_hi_u32 v8, v1, v2
; GFX1030W32-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX1030W32-NEXT: v_add_co_u32 v4, vcc_lo, v4, v5
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v3, vcc_lo, v4, v3
; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo
; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v8, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030W32-NEXT: v_mul_hi_u32 v2, s0, v0
; GFX1030W32-NEXT: v_mul_lo_u32 v4, s1, v0
; GFX1030W32-NEXT: v_mul_lo_u32 v3, s0, v1
@@ -2402,14 +2402,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: v_mul_hi_u32 v8, v1, v2
; GFX1030W32-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX1030W32-NEXT: v_add_co_u32 v4, vcc_lo, v4, v5
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v3, vcc_lo, v4, v3
; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo
; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v8, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030W32-NEXT: v_mul_hi_u32 v2, s10, v0
; GFX1030W32-NEXT: v_mul_hi_u32 v5, s11, v0
; GFX1030W32-NEXT: v_mul_lo_u32 v3, s10, v1
@@ -2418,27 +2418,27 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: v_mul_hi_u32 v6, s11, v1
; GFX1030W32-NEXT: v_mul_lo_u32 v1, s11, v1
; GFX1030W32-NEXT: v_add_co_u32 v2, vcc_lo, v2, v3
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v3, v5, vcc_lo
; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v6, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v0, vcc_lo, v0, v1
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
; GFX1030W32-NEXT: v_mul_hi_u32 v2, s2, v0
; GFX1030W32-NEXT: v_mul_lo_u32 v4, s3, v0
; GFX1030W32-NEXT: v_mul_lo_u32 v3, s2, v1
+; GFX1030W32-NEXT: v_add_co_u32 v5, s0, v0, 1
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v1, s0
; GFX1030W32-NEXT: v_add_nc_u32_e32 v2, v2, v3
; GFX1030W32-NEXT: v_mul_lo_u32 v3, s2, v0
; GFX1030W32-NEXT: v_add_nc_u32_e32 v2, v2, v4
; GFX1030W32-NEXT: v_sub_co_u32 v3, vcc_lo, s10, v3
; GFX1030W32-NEXT: v_sub_nc_u32_e32 v4, s11, v2
-; GFX1030W32-NEXT: v_subrev_co_ci_u32_e64 v4, s0, s3, v4, vcc_lo
-; GFX1030W32-NEXT: v_add_co_u32 v5, s0, v0, 1
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v1, s0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v2, null, s11, v2, vcc_lo
; GFX1030W32-NEXT: v_sub_co_u32 v7, s0, v3, s2
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, s11, v2, vcc_lo
-; GFX1030W32-NEXT: v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0
+; GFX1030W32-NEXT: v_subrev_co_ci_u32_e64 v4, null, s3, v4, vcc_lo
; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v7
+; GFX1030W32-NEXT: v_subrev_co_ci_u32_e64 v4, null, 0, v4, s0
; GFX1030W32-NEXT: v_cmp_eq_u32_e64 s0, s3, v2
; GFX1030W32-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo
; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v4
@@ -2451,7 +2451,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, v9, v3, s0
; GFX1030W32-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc_lo
; GFX1030W32-NEXT: v_add_co_u32 v7, vcc_lo, v0, 2
-; GFX1030W32-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo
; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX1030W32-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc_lo
; GFX1030W32-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
@@ -2526,14 +2526,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_mul_hi_u32 v8, v1, v2
; GFX1030W64-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX1030W64-NEXT: v_add_co_u32 v4, vcc, v4, v5
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v5, vcc, 0, v6, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc
; GFX1030W64-NEXT: v_add_co_u32 v3, vcc, v4, v3
; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v7, vcc
; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v4, vcc, 0, v8, vcc
; GFX1030W64-NEXT: v_add_co_u32 v2, vcc, v3, v2
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v4, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc
; GFX1030W64-NEXT: v_add_co_u32 v0, vcc, v0, v2
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1030W64-NEXT: v_mul_hi_u32 v2, s0, v0
; GFX1030W64-NEXT: v_mul_lo_u32 v4, s1, v0
; GFX1030W64-NEXT: v_mul_lo_u32 v3, s0, v1
@@ -2548,14 +2548,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_mul_hi_u32 v8, v1, v2
; GFX1030W64-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX1030W64-NEXT: v_add_co_u32 v4, vcc, v4, v5
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v5, vcc, 0, v6, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc
; GFX1030W64-NEXT: v_add_co_u32 v3, vcc, v4, v3
; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v7, vcc
; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v4, vcc, 0, v8, vcc
; GFX1030W64-NEXT: v_add_co_u32 v2, vcc, v3, v2
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v4, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc
; GFX1030W64-NEXT: v_add_co_u32 v0, vcc, v0, v2
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1030W64-NEXT: v_mul_hi_u32 v2, s10, v0
; GFX1030W64-NEXT: v_mul_hi_u32 v5, s11, v0
; GFX1030W64-NEXT: v_mul_lo_u32 v3, s10, v1
@@ -2564,27 +2564,27 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_mul_hi_u32 v6, s11, v1
; GFX1030W64-NEXT: v_mul_lo_u32 v1, s11, v1
; GFX1030W64-NEXT: v_add_co_u32 v2, vcc, v2, v3
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v4, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc
; GFX1030W64-NEXT: v_add_co_u32 v0, vcc, v2, v0
; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v0, vcc, v3, v5, vcc
; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, 0, v6, vcc
; GFX1030W64-NEXT: v_add_co_u32 v0, vcc, v0, v1
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc
; GFX1030W64-NEXT: v_mul_hi_u32 v2, s2, v0
; GFX1030W64-NEXT: v_mul_lo_u32 v4, s3, v0
; GFX1030W64-NEXT: v_mul_lo_u32 v3, s2, v1
+; GFX1030W64-NEXT: v_add_co_u32 v5, s[0:1], v0, 1
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v1, s[0:1]
; GFX1030W64-NEXT: v_add_nc_u32_e32 v2, v2, v3
; GFX1030W64-NEXT: v_mul_lo_u32 v3, s2, v0
; GFX1030W64-NEXT: v_add_nc_u32_e32 v2, v2, v4
; GFX1030W64-NEXT: v_sub_co_u32 v3, vcc, s10, v3
; GFX1030W64-NEXT: v_sub_nc_u32_e32 v4, s11, v2
-; GFX1030W64-NEXT: v_subrev_co_ci_u32_e64 v4, s[0:1], s3, v4, vcc
-; GFX1030W64-NEXT: v_add_co_u32 v5, s[0:1], v0, 1
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v2, null, s11, v2, vcc
; GFX1030W64-NEXT: v_sub_co_u32 v7, s[0:1], v3, s2
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e32 v2, vcc, s11, v2, vcc
-; GFX1030W64-NEXT: v_subrev_co_ci_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX1030W64-NEXT: v_subrev_co_ci_u32_e64 v4, null, s3, v4, vcc
; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v7
+; GFX1030W64-NEXT: v_subrev_co_ci_u32_e64 v4, null, 0, v4, s[0:1]
; GFX1030W64-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v2
; GFX1030W64-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s3, v4
@@ -2597,7 +2597,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[0:1]
; GFX1030W64-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc
; GFX1030W64-NEXT: v_add_co_u32 v7, vcc, v0, 2
-; GFX1030W64-NEXT: v_add_co_ci_u32_e32 v8, vcc, 0, v1, vcc
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc
; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX1030W64-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX1030W64-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
@@ -2679,27 +2679,28 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_mul_hi_u32 v8, v1, v2
; GFX11-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v5
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: v_mul_hi_u32 v2, s0, v0
; GFX11-NEXT: v_mul_lo_u32 v4, s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v3, s0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
; GFX11-NEXT: v_mul_lo_u32 v3, s0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
; GFX11-NEXT: v_mul_hi_u32 v4, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX11-NEXT: v_mul_hi_u32 v6, v0, v2
; GFX11-NEXT: v_mul_hi_u32 v7, v1, v3
@@ -2707,57 +2708,59 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_mul_hi_u32 v8, v1, v2
; GFX11-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v5
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v7, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: v_mul_hi_u32 v2, s10, v0
; GFX11-NEXT: v_mul_hi_u32 v5, s11, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX11-NEXT: v_mul_hi_u32 v4, s10, v1
; GFX11-NEXT: v_mul_lo_u32 v0, s11, v0
; GFX11-NEXT: v_mul_hi_u32 v6, s11, v1
; GFX11-NEXT: v_mul_lo_u32 v1, s11, v1
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v4, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v3, v5, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v1
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
; GFX11-NEXT: v_mul_hi_u32 v2, s2, v0
; GFX11-NEXT: v_mul_lo_u32 v4, s3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v3, s2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v5, s0, v0, 1
+; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
; GFX11-NEXT: v_mul_lo_u32 v3, s2, v0
; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, s10, v3
; GFX11-NEXT: v_sub_nc_u32_e32 v4, s11, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_subrev_co_ci_u32_e64 v4, s0, s3, v4, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v5, s0, v0, 1
-; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, 0, v1, s0
-; GFX11-NEXT: v_sub_co_u32 v7, s0, v3, s2
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, s11, v2, vcc_lo
-; GFX11-NEXT: v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v2, null, s11, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sub_co_u32 v7, s0, v3, s2
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v4, null, s3, v4, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v7
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v4, null, 0, v4, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v4
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v3
@@ -2765,15 +2768,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v2, v9, v3, s0
; GFX11-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc_lo
; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v0, 2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v3, v5, v7 :: v_dual_cndmask_b32 v4, v6, v8
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_cndmask_b32 v0, v0, v3
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
; GFX11-NEXT: s_cbranch_vccnz .LBB16_3
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 3305cac0d7ea6..890f4f77ed107 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -140,7 +140,7 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) {
; GFX10-NEXT: .LBB0_2: ; %endif
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v5 offset:252
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -297,7 +297,7 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in,
; GFX10-NEXT: .LBB1_2: ; %endif
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v5 offset:252
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -410,7 +410,7 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in
; GFX10-NEXT: .LBB2_2: ; %endif
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3d0800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v5 offset:252
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -577,12 +577,12 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
; GFX10-NEXT: s_cbranch_execz .LBB3_2
; GFX10-NEXT: ; %bb.1: ; %if
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX10-NEXT: flat_load_sbyte v4, v[2:3] offset:2047
; GFX10-NEXT: .LBB3_2: ; %endif
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -700,12 +700,12 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
; GFX10-NEXT: s_cbranch_execz .LBB4_2
; GFX10-NEXT: ; %bb.1: ; %if
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX10-NEXT: flat_load_sbyte v4, v[2:3]
; GFX10-NEXT: .LBB4_2: ; %endif
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x61800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v4 offset:636
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -823,12 +823,12 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
; GFX10-NEXT: s_cbranch_execz .LBB5_2
; GFX10-NEXT: ; %bb.1: ; %if
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
; GFX10-NEXT: flat_load_sbyte v6, v[2:3]
; GFX10-NEXT: .LBB5_2: ; %endif
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v6
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
index 0c5b67580c352..54cbc25043db3 100644
--- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
@@ -82,11 +82,11 @@ define noundef i64 @srem64_3(i64 noundef %i) {
; GFX1030-NEXT: v_add3_u32 v3, v4, v3, v5
; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v3, vcc_lo
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, 0
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4]
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = srem i64 %i, 3
@@ -169,11 +169,11 @@ define noundef i64 @srem64_6(i64 noundef %i) {
; GFX1030-NEXT: v_add3_u32 v3, v4, v3, v5
; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 31, v3
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v3, vcc_lo
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, 0
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4]
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = srem i64 %i, 3
@@ -245,7 +245,7 @@ define noundef i64 @urem64_3(i64 noundef %i) {
; GFX1030-NEXT: v_mov_b32_e32 v2, v5
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 3, v[2:3]
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = urem i64 %i, 3
@@ -317,7 +317,7 @@ define noundef i64 @urem64_6(i64 noundef %i) {
; GFX1030-NEXT: v_mov_b32_e32 v2, v5
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 6, v[2:3]
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = urem i64 %i, 6
@@ -390,7 +390,7 @@ define noundef i64 @sdiv64_3(i64 noundef %i) {
; GFX1030-NEXT: v_add3_u32 v1, v5, v1, v4
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v1
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%div = sdiv i64 %i, 3
@@ -463,7 +463,7 @@ define noundef i64 @sdiv64_6(i64 noundef %i) {
; GFX1030-NEXT: v_add3_u32 v1, v5, v1, v4
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v1
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%div = sdiv i64 %i, 3
@@ -611,10 +611,10 @@ define noundef i64 @srem64_2(i64 noundef %i) {
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v1
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v2, -2, v2
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = srem i64 %i, 2
@@ -645,7 +645,7 @@ define noundef i64 @sdiv64_2(i64 noundef %i) {
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v1
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -737,10 +737,10 @@ define noundef i64 @srem64_64(i64 noundef %i) {
; GFX1030-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 26, v2
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v2, 0xffffffc0, v2
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = srem i64 %i, 64
@@ -774,7 +774,7 @@ define noundef i64 @sdiv64_64(i64 noundef %i) {
; GFX1030-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 26, v2
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_ashrrev_i64 v[0:1], 6, v[0:1]
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -866,10 +866,10 @@ define noundef i64 @srem64_i32min(i64 noundef %i) {
; GFX1030-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v2, 0x80000000, v2
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = srem i64 %i, -2147483648
@@ -908,10 +908,10 @@ define noundef i64 @sdiv64_i32min(i64 noundef %i) {
; GFX1030-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v0
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%div = sdiv i64 %i, -2147483648
@@ -954,7 +954,7 @@ define noundef i64 @urem64_i32min(i64 noundef %i) {
; GFX1030-NEXT: v_add_lshl_u32 v2, v2, v4, 30
; GFX1030-NEXT: v_and_b32_e32 v2, 0x80000000, v2
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = urem i64 %i, -2147483648
@@ -1096,17 +1096,17 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
; GFX1030-NEXT: v_sub_nc_u32_e32 v4, v4, v0
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x80000001, v1, v[2:3]
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 1, v[2:3]
; GFX1030-NEXT: v_add_nc_u32_e32 v3, v1, v3
; GFX1030-NEXT: v_ashrrev_i64 v[4:5], 30, v[2:3]
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v3
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v5, vcc_lo
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x7fffffff, v2, 0
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x7fffffff, v4, v[3:4]
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = srem i64 %i, 2147483647
@@ -1191,13 +1191,13 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
; GFX1030-NEXT: v_sub_nc_u32_e32 v4, v4, v0
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x80000001, v1, v[2:3]
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, 1, v[2:3]
; GFX1030-NEXT: v_add_nc_u32_e32 v3, v1, v3
; GFX1030-NEXT: v_ashrrev_i64 v[0:1], 30, v[2:3]
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v3
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%div = sdiv i64 %i, 2147483647
@@ -1271,16 +1271,16 @@ define noundef i64 @urem64_i32max(i64 noundef %i) {
; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v1, 2, v[2:3]
; GFX1030-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX1030-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v4, null, v5, v3, vcc_lo
; GFX1030-NEXT: v_alignbit_b32 v2, v4, v2, 30
; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 30, v4
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x7fffffff, v2, 0
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x7fffffff, v4, v[3:4]
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%rem = urem i64 %i, 2147483647
@@ -1342,10 +1342,10 @@ define noundef i64 @udiv64_i32max(i64 noundef %i) {
; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v1, 2, v[2:3]
; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX1030-NEXT: v_alignbit_b32 v0, v1, v0, 30
; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; GFX1030-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
index 34b794705e983..402a294322273 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX942
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11
; GCN-LABEL: {{^}}dpp64_ceil:
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
@@ -77,7 +77,8 @@ define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
; GFX942: v_lshl_add_u64
; GFX10PLUS: v_mov_b32_dpp
; GFX10PLUS: v_add_co_u32
-; GFX10PLUS: v_add_co_ci_u32_e32
+; GFX10: v_add_co_ci_u32_e32
+; GFX11: v_add_co_ci_u32_e64
define amdgpu_cs void @dpp64_loop(i64 %arg, i64 %val) {
bb:
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index c713c48c92457..49cbc75d65a85 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -404,7 +404,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1021,7 +1022,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -2714,7 +2716,8 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3331,7 +3334,8 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -6064,12 +6068,11 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_3
@@ -6080,7 +6083,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB31_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB31_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5]
@@ -6161,11 +6163,11 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB31_3
@@ -6485,12 +6487,11 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_3
@@ -6501,7 +6502,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5]
@@ -6583,11 +6583,11 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB32_3
@@ -7297,11 +7297,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_3
@@ -7312,7 +7311,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
@@ -7391,10 +7389,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB34_3
@@ -7708,11 +7706,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_3
@@ -7723,7 +7720,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB35_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB35_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
@@ -7803,10 +7799,10 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB35_3
@@ -8412,7 +8408,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8487,15 +8483,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8710,7 +8706,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8786,15 +8782,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9286,7 +9282,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9358,15 +9354,15 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9573,7 +9569,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9646,15 +9642,15 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10297,7 +10293,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -10373,15 +10369,15 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10598,7 +10594,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -10671,15 +10667,15 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11234,7 +11230,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -11328,16 +11324,16 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
@@ -11588,7 +11584,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -11683,16 +11679,16 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
@@ -11943,7 +11939,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -12034,16 +12030,16 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
@@ -12286,7 +12282,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -12378,16 +12374,16 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
@@ -13523,7 +13519,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -13618,16 +13614,16 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
@@ -13880,7 +13876,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -13972,16 +13968,16 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
@@ -14628,12 +14624,12 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15196,9 +15192,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
@@ -17069,12 +17066,12 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
@@ -17901,9 +17898,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 71abe6f32e81e..f72d1b83ca50f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -352,7 +352,8 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -800,7 +801,8 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1894,7 +1896,8 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2342,7 +2345,8 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -3162,12 +3166,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_3
@@ -3178,7 +3181,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5]
@@ -3265,11 +3267,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB19_3
@@ -3575,12 +3577,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_3
@@ -3591,7 +3592,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5]
@@ -3679,11 +3679,11 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB20_3
@@ -4374,11 +4374,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_3
@@ -4389,7 +4388,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7]
@@ -4475,10 +4473,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB22_3
@@ -4780,11 +4778,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_3
@@ -4795,7 +4792,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7]
@@ -4882,10 +4878,10 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB23_3
@@ -6308,7 +6304,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6387,16 +6383,16 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6620,7 +6616,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6700,16 +6696,16 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7226,7 +7222,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -7303,16 +7299,16 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7529,7 +7525,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -7607,16 +7603,16 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8299,7 +8295,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8379,16 +8375,16 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8614,7 +8610,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -8692,16 +8688,16 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9267,7 +9263,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9361,16 +9357,16 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
@@ -9622,7 +9618,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9717,16 +9713,16 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
@@ -10310,7 +10306,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -10401,16 +10397,16 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
@@ -10654,7 +10650,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -10746,16 +10742,16 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
@@ -11563,7 +11559,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -11658,16 +11654,16 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
@@ -11921,7 +11917,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -12013,16 +12009,16 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
@@ -12806,12 +12802,12 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_pk_max_f16 v1, v2, v2
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -13517,9 +13513,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
@@ -14983,12 +14980,12 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
@@ -16022,9 +16019,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 49c4b9000d8b5..0213ca43a2d81 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -352,7 +352,8 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -800,7 +801,8 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1894,7 +1896,8 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2342,7 +2345,8 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -3162,12 +3166,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_3
@@ -3178,7 +3181,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5]
@@ -3265,11 +3267,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB19_3
@@ -3575,12 +3577,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_3
@@ -3591,7 +3592,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5]
@@ -3679,11 +3679,11 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB20_3
@@ -4374,11 +4374,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_3
@@ -4389,7 +4388,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7]
@@ -4475,10 +4473,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB22_3
@@ -4780,11 +4778,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_3
@@ -4795,7 +4792,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7]
@@ -4882,10 +4878,10 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB23_3
@@ -6308,7 +6304,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6387,16 +6383,16 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6620,7 +6616,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6700,16 +6696,16 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7226,7 +7222,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -7303,16 +7299,16 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7529,7 +7525,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -7607,16 +7603,16 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8299,7 +8295,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8379,16 +8375,16 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8614,7 +8610,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -8692,16 +8688,16 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9267,7 +9263,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9361,16 +9357,16 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
@@ -9622,7 +9618,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9717,16 +9713,16 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
@@ -10310,7 +10306,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -10401,16 +10397,16 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
@@ -10654,7 +10650,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -10746,16 +10742,16 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
@@ -11563,7 +11559,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -11658,16 +11654,16 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
@@ -11921,7 +11917,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -12013,16 +12009,16 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
@@ -12806,12 +12802,12 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_pk_max_f16 v1, v2, v2
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -13517,9 +13513,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
@@ -14983,12 +14980,12 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
@@ -16022,9 +16019,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index a6f8880d6d6f8..81a9528ca6aa3 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -472,12 +472,12 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1061,9 +1061,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
@@ -2071,12 +2072,12 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2660,9 +2661,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
@@ -3635,12 +3637,11 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_3
@@ -3651,7 +3652,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: .LBB17_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB17_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5]
@@ -3745,11 +3745,11 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB17_3
@@ -4082,12 +4082,11 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
@@ -4098,7 +4097,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5]
@@ -4193,11 +4191,11 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_3
@@ -4946,11 +4944,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_3
@@ -4961,7 +4958,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
@@ -5053,10 +5049,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB20_3
@@ -5383,11 +5379,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_3
@@ -5398,7 +5393,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global
; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
@@ -5491,10 +5485,10 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB21_3
@@ -6113,7 +6107,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6188,15 +6182,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6411,7 +6405,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6487,15 +6481,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6987,7 +6981,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7059,15 +7053,15 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7274,7 +7268,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7347,15 +7341,15 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7998,7 +7992,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8074,15 +8068,15 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8299,7 +8293,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8372,15 +8366,15 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8935,7 +8929,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9029,16 +9023,16 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
@@ -9289,7 +9283,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9384,16 +9378,16 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
@@ -9975,7 +9969,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -10066,16 +10060,16 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
@@ -10318,7 +10312,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -10410,16 +10404,16 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
@@ -11224,7 +11218,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -11319,16 +11313,16 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
@@ -11581,7 +11575,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -11673,16 +11667,16 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
@@ -12426,12 +12420,12 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13081,9 +13075,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
@@ -14498,12 +14493,12 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
@@ -15537,9 +15532,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 07c9521e7646a..fa4b0f4313353 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -133,7 +133,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -275,7 +276,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB1_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -431,7 +433,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -579,7 +582,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB3_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -718,7 +722,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -854,7 +859,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB5_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -1003,7 +1009,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -1145,7 +1152,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB7_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -2427,7 +2435,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -2569,7 +2578,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB17_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -2725,7 +2735,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -2873,7 +2884,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB19_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -3012,7 +3024,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -3148,7 +3161,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB21_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -3297,7 +3311,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v1, null, s3, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -3439,7 +3454,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB23_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
@@ -14074,10 +14090,10 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
@@ -14224,10 +14240,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB108_4: ; %atomicrmw.end
@@ -14388,10 +14404,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
@@ -14544,10 +14560,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB110_4: ; %atomicrmw.end
@@ -14691,10 +14707,10 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
@@ -14835,10 +14851,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB112_4: ; %atomicrmw.end
@@ -14992,10 +15008,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
@@ -15142,10 +15158,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
; GFX12-NEXT: .LBB114_4: ; %atomicrmw.end
@@ -15302,7 +15318,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -15463,7 +15479,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1]
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
@@ -15636,7 +15652,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -15803,7 +15819,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1]
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
@@ -15959,7 +15975,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -16114,7 +16130,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1]
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
@@ -16280,7 +16296,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1]
; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -16441,7 +16457,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1]
; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, s1
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index cdb31534674de..f199db3ca12ca 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -5314,7 +5314,8 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 1, v6
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v5
; GFX11-NEXT: s_cbranch_execz .LBB81_2
; GFX11-NEXT: ; %bb.1: ; %if
diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
index 88cc4b1c96b4a..0959687d3834c 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
@@ -36,7 +36,7 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -57,8 +57,8 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -75,10 +75,9 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%idx = add nsw i32 %i, 3
%arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 2eb35977b8160..2af12d150154a 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1165,10 +1165,10 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 20, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 20, v2
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x3ff00000, v1, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, 0, 0
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, 0x3ff00000, v3, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
%conv = uitofp <2 x i64> %shl to <2 x double>
@@ -1660,8 +1660,8 @@ define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 20, v0
; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x36a00000, v0, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i32 1, %cnt
diff --git a/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll b/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
index 08799e7e26bdf..f4163089de5de 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx10-vop-literal.ll
@@ -1,9 +1,10 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s
; GCN-LABEL: {{^}}test_add_lit:
-; GFX10: v_add_co_u32{{(_e64)?}} v{{[0-9]+}}, vcc_lo, 0x80992bff, v{{[0-9]+}}
+; GFX10PLUS: v_add_co_u32{{(_e64)?}} v{{[0-9]+}}, vcc_lo, 0x80992bff, v{{[0-9]+}}
+; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, 0xe7, v{{[0-9]+}}, vcc_lo
; GFX10: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0xe7, v{{[0-9]+}}, vcc_lo
; GFX9: v_mov_b32_e32 [[C2:v[0-9]+]], 0xe7
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x80992bff, v{{[0-9]+}}
@@ -18,7 +19,7 @@ define amdgpu_kernel void @test_add_lit(ptr addrspace(1) %p) {
}
; GCN-LABEL: {{^}}test_cndmask_lit:
-; GFX10: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3039, v{{[0-9]+}}, vcc_lo
+; GFX10PLUS: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3039, v{{[0-9]+}}, vcc_lo
; GFX9: v_mov_b32_e32 [[C:v[0-9]+]], 0x3039
; GFX9: v_cndmask_b32_e32 v{{[0-9]+}}, [[C]], v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_cndmask_lit(ptr addrspace(1) %p) {
@@ -35,8 +36,8 @@ define amdgpu_kernel void @test_cndmask_lit(ptr addrspace(1) %p) {
}
; GCN-LABEL: {{^}}test_bfe_2lit_s:
-; GFX10: v_mov_b32_e32 [[C1:v[0-9]+]], 0xddd5
-; GFX10: v_bfe_u32 v{{[0-9]+}}, 0x3039, s{{[0-9]+}}, [[C1]]
+; GFX10PLUS: v_mov_b32_e32 [[C1:v[0-9]+]], 0xddd5
+; GFX10PLUS: v_bfe_u32 v{{[0-9]+}}, 0x3039, s{{[0-9]+}}, [[C1]]
; GFX9-DAG: v_mov_b32_e32 [[C2:v[0-9]+]], 0xddd5
; GFX9-DAG: s_movk_i32 [[C1:s[0-9]+]], 0x3039
; GFX9: v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, [[C2]]
@@ -47,8 +48,8 @@ define amdgpu_kernel void @test_bfe_2lit_s(ptr addrspace(1) %p, i32 %src) {
}
; GCN-LABEL: {{^}}test_bfe_2lit_v:
-; GFX10: s_movk_i32 [[C1:s[0-9]+]], 0x3039
-; GFX10: v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, 0xddd5
+; GFX10PLUS: s_movk_i32 [[C1:s[0-9]+]], 0x3039
+; GFX10PLUS: v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, 0xddd5
; GFX9-DAG: v_mov_b32_e32 [[C2:v[0-9]+]], 0xddd5
; GFX9-DAG: s_movk_i32 [[C1:s[0-9]+]], 0x3039
; GFX9: v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, [[C2]]
@@ -62,4 +63,4 @@ define amdgpu_kernel void @test_bfe_2lit_v(ptr addrspace(1) %p) {
}
declare i32 @llvm.amdgcn.workitem.id.x()
-declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32)
+declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32)
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
index 61ae9639c52d0..737985c27c5d3 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
@@ -272,8 +272,8 @@ define amdgpu_ps void @test_s_load_i16_divergent(ptr addrspace(4) inreg %in, i32
; DAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; DAG-NEXT: v_lshlrev_b64_e32 v[3:4], 1, v[3:4]
; DAG-NEXT: v_add_co_u32 v3, vcc_lo, s0, v3
-; DAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; DAG-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s1, v4, vcc_lo
+; DAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAG-NEXT: v_add_co_ci_u32_e64 v4, null, s1, v4, vcc_lo
; DAG-NEXT: global_load_i16 v0, v[3:4], off offset:32
; DAG-NEXT: s_wait_loadcnt 0x0
; DAG-NEXT: global_store_b32 v[1:2], v0, off
@@ -287,8 +287,8 @@ define amdgpu_ps void @test_s_load_i16_divergent(ptr addrspace(4) inreg %in, i32
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 1, v[0:1]
; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v6, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v6, v1, vcc_lo
; GISEL-NEXT: global_load_i16 v0, v[0:1], off offset:32
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: global_store_b32 v[3:4], v0, off
@@ -388,8 +388,8 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
; DAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; DAG-NEXT: v_lshlrev_b64_e32 v[3:4], 1, v[3:4]
; DAG-NEXT: v_add_co_u32 v3, vcc_lo, s0, v3
-; DAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; DAG-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s1, v4, vcc_lo
+; DAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAG-NEXT: v_add_co_ci_u32_e64 v4, null, s1, v4, vcc_lo
; DAG-NEXT: global_load_u16 v0, v[3:4], off offset:32
; DAG-NEXT: s_wait_loadcnt 0x0
; DAG-NEXT: global_store_b32 v[1:2], v0, off
@@ -403,8 +403,8 @@ define amdgpu_ps void @test_s_load_u16_divergent(ptr addrspace(4) inreg %in, i32
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 1, v[0:1]
; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v6, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v6, v1, vcc_lo
; GISEL-NEXT: global_load_u16 v0, v[0:1], off offset:32
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: global_store_b32 v[3:4], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index a466e9f6f6106..5559e75b17cd6 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -8555,7 +8555,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8630,15 +8630,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8905,7 +8905,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8981,15 +8981,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9582,7 +9582,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9654,15 +9654,15 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9919,7 +9919,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9992,15 +9992,15 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -10769,7 +10769,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -10845,15 +10845,15 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11122,7 +11122,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -11195,15 +11195,15 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11858,7 +11858,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -11952,16 +11952,16 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
@@ -12264,7 +12264,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -12359,16 +12359,16 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
@@ -13051,7 +13051,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -13142,16 +13142,16 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
@@ -13444,7 +13444,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -13536,16 +13536,16 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
@@ -14476,7 +14476,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -14571,16 +14571,16 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start
@@ -14885,7 +14885,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -14977,16 +14977,16 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index a1f5a0289172f..05bf93d83c01d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4778,7 +4778,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -4857,16 +4857,16 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5142,7 +5142,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -5222,16 +5222,16 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5849,7 +5849,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -5926,16 +5926,16 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6202,7 +6202,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -6280,16 +6280,16 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7098,7 +7098,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7178,16 +7178,16 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7465,7 +7465,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -7543,16 +7543,16 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8219,7 +8219,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8313,16 +8313,16 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
@@ -8627,7 +8627,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8722,16 +8722,16 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
@@ -9418,7 +9418,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -9509,16 +9509,16 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
@@ -9813,7 +9813,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -9905,16 +9905,16 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
@@ -10851,7 +10851,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -10946,16 +10946,16 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
@@ -11262,7 +11262,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -11354,16 +11354,16 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index b026ed6250ce4..60d2b5c0954f7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4778,7 +4778,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -4857,16 +4857,16 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5142,7 +5142,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -5222,16 +5222,16 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5849,7 +5849,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -5926,16 +5926,16 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6202,7 +6202,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -6280,16 +6280,16 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7098,7 +7098,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7178,16 +7178,16 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7465,7 +7465,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -7543,16 +7543,16 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8219,7 +8219,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8313,16 +8313,16 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
@@ -8627,7 +8627,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8722,16 +8722,16 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
@@ -9418,7 +9418,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -9509,16 +9509,16 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
@@ -9813,7 +9813,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -9905,16 +9905,16 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
@@ -10851,7 +10851,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -10946,16 +10946,16 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
@@ -11262,7 +11262,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -11354,16 +11354,16 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index fa619f97256bd..0c955efe83e37 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -5523,7 +5523,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -5598,15 +5598,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -5873,7 +5873,7 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -5949,15 +5949,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6550,7 +6550,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6622,15 +6622,15 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -6887,7 +6887,7 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6960,15 +6960,15 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7737,7 +7737,7 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7813,15 +7813,15 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8090,7 +8090,7 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8163,15 +8163,15 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: global_load_b32 v4, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8826,7 +8826,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -8920,16 +8920,16 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
@@ -9232,7 +9232,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -9327,16 +9327,16 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
@@ -10019,7 +10019,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -10110,16 +10110,16 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
@@ -10412,7 +10412,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -10504,16 +10504,16 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
@@ -11444,7 +11444,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
@@ -11539,16 +11539,16 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
@@ -11853,7 +11853,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
@@ -11945,16 +11945,16 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index f096c1752b84a..d9fea14ef4aa9 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -1122,8 +1122,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
@@ -1209,8 +1209,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrsp
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
@@ -1390,8 +1390,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0x7FFFFF(ptr addrs
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x7ff000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
@@ -1438,8 +1438,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_0xFFFFFF(ptr addrs
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xff800000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
@@ -1673,10 +1673,10 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
; GFX12-GISEL-NEXT: ds_load_b64 v[1:2], v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v1, v0
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: ; return to shader part epilog
%sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
@@ -1740,10 +1740,10 @@ define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %vo
; GFX12-GISEL-NEXT: ds_load_b64 v[1:2], v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v1, v0
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v2, vcc
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:42
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: ; return to shader part epilog
%sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
@@ -1919,7 +1919,8 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3
; GFX11-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
@@ -1927,10 +1928,10 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3
; GFX12-SDAG-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, v0, s2
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32:
@@ -1938,12 +1939,11 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i3
; GFX12-GISEL-NEXT: s_mov_b32 s3, 0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v0, v2
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
@@ -1976,7 +1976,8 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1
; GFX11-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
@@ -1984,10 +1985,10 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1
; GFX12-SDAG-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, v0, s2
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
@@ -1995,12 +1996,11 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1
; GFX12-GISEL-NEXT: s_mov_b32 s3, 0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v0, v2
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
@@ -2050,8 +2050,8 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
@@ -2064,11 +2064,10 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_saddr_f32_natural_addressing:
@@ -2081,11 +2080,10 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v2, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: ; return to shader part epilog
%voffset = load i32, ptr addrspace(1) %voffset.ptr
%zext.offset = zext i32 %voffset to i64
@@ -2233,8 +2231,8 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
@@ -2247,11 +2245,10 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc, s2, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
@@ -2264,11 +2261,10 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addr
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc, v2, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc, v3, v1, vcc
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc
; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: ; return to shader part epilog
%voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{}
%zext.offset = zext i32 %voffset to i64
@@ -4753,10 +4749,10 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc
; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2
@@ -4873,10 +4869,10 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc, v1, v3, vcc
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc
; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 63b5b0f76eaaa..0f60ceb06f2dd 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -139,7 +139,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x1000
; GFX11-NEXT: v_cndmask_b32_e32 v6, v1, v6, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, vcc_lo, 0, v1, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v6
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v4
@@ -297,10 +297,11 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: v_subrev_nc_u32_e32 v5, s6, v4
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
; GFX11-NEXT: global_store_b32 v3, v4, s[8:9]
; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
@@ -703,8 +704,8 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v3, vcc_lo
; GFX11-NEXT: global_store_b16 v4, v2, s[0:1]
; GFX11-NEXT: s_cbranch_scc0 .LBB4_1
; GFX11-NEXT: ; %bb.2: ; %bb2
@@ -815,9 +816,9 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_trunc_f32_e32 v3, v3
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v3, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v3, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 027576630c877..33e5d1d2ca473 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -5310,7 +5310,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX10-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3
; GFX10-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX10-SDAG-NEXT: v_mul_lo_u32 v6, v5, v2
@@ -5318,7 +5318,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6
; GFX10-SDAG-NEXT: v_mul_lo_u32 v6, v4, v3
; GFX10-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v4, v2, 0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v5, vcc_lo
; GFX10-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2
; GFX10-SDAG-NEXT: v_add3_u32 v4, v4, v6, v2
; GFX10-SDAG-NEXT: v_mul_lo_u32 v2, v3, v1
@@ -5335,23 +5335,23 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2]
; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, v6
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, v[4:5]
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, 0
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v4, v7, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v4, v7, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v6
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v3, v[1:2]
; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v4, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v4, vcc_lo
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v8, 0
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[6:7]
; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v5, 1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v4
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v7, 0
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v6, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v6, vcc_lo
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v10, v[2:3]
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[1:2]
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v8, v[4:5]
@@ -5362,31 +5362,32 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3
; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v6, v5, v2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_u32 v6, vcc_lo, v0, v4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v5, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v7, v6, v3
; GFX11-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v6, v2, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v1, v3, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add3_u32 v4, v4, v7, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX11-SDAG-NEXT: v_mad_u64_u32 v[5:6], null, v3, v0, v[3:4]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add3_u32 v6, v2, v6, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v5, v4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v4, v6, v3
; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v5, v3, v[5:6]
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -5394,18 +5395,19 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v0, 1
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v3, v[1:2]
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, v[4:5]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v7
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v4, v2, 0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v5, v8, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v5, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v5, vcc_lo
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v7
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2]
@@ -5418,7 +5420,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v12, v[2:3]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v9, 0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v8, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v11, v[4:5]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[1:2]
@@ -5434,7 +5436,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -5445,7 +5447,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v4, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v2, 0
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
+; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v5, vcc_lo
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2
; GFX1200-SDAG-NEXT: v_add3_u32 v4, v4, v6, v2
@@ -5460,7 +5462,6 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1]
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i64:
@@ -5472,7 +5473,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -5484,11 +5485,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v4, v2
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v0, v5, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v0, v5, vcc_lo
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v3, v[1:2]
; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v0, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v0, vcc_lo
; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v7, 1
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v4
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[3:4]
@@ -5496,14 +5497,13 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v6, v[0:1]
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v5, v8
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v4, v[2:3]
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v5, v3, v[0:1]
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v5, v8
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v8, v[2:3]
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add i64 %x, 1
@@ -6065,9 +6065,9 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
; GFX10-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v3, vcc_lo
; GFX10-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4
; GFX10-SDAG-NEXT: v_mul_lo_u32 v13, v8, v5
; GFX10-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0
@@ -6077,9 +6077,9 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX10-SDAG-NEXT: v_add3_u32 v12, v1, v13, v12
; GFX10-SDAG-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8
; GFX10-SDAG-NEXT: v_add3_u32 v13, v3, v15, v14
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v12, v9, vcc_lo
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v9, vcc_lo
; GFX10-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v2, v10
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v13, v11, vcc_lo
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v9, null, v13, v11, vcc_lo
; GFX10-SDAG-NEXT: v_mul_lo_u32 v10, v3, v4
; GFX10-SDAG-NEXT: v_mul_lo_u32 v11, v1, v5
; GFX10-SDAG-NEXT: v_mul_lo_u32 v7, v8, v7
@@ -6110,30 +6110,30 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v0, 1
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo
; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v2, 1
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v4, 0
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, vcc_lo
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v14, v6, 0
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v5, v[1:2]
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v14, v7, v[3:4]
; GFX10-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v12
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v13, v4, v[8:9]
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v3, v4, 0
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v10, v13, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v10, v13, vcc_lo
; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v2, v14
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v6, v[9:10]
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, 0
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, v8, v15, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v8, v15, vcc_lo
; GFX10-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v0, 1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v12
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v10, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v10, vcc_lo
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v11, v19, 0
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v5, v[0:1]
; GFX10-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v17, v7, v[1:2]
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v8, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v8, vcc_lo
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v15, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v10
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v16, v4, v[14:15]
@@ -6141,11 +6141,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v18, v6, v[0:1]
; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v8
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v11, v20, v[1:2]
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v4, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v4, vcc_lo
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v13, v12, v[6:7]
; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v13, 1
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, 0, v5, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v5, vcc_lo
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v4, v19, v[10:11]
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v17, 0
; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v15, v[11:12]
@@ -6160,9 +6160,10 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v3, vcc_lo
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4
; GFX11-SDAG-NEXT: v_mul_lo_u32 v13, v8, v5
@@ -6174,10 +6175,10 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v13, v12
; GFX11-SDAG-NEXT: v_add3_u32 v12, v3, v15, v14
; GFX11-SDAG-NEXT: v_add_co_u32 v3, vcc_lo, v0, v8
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v9, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v8, null, v1, v9, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v9, vcc_lo, v2, v10
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v12, v11, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v10, null, v12, v11, vcc_lo
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-SDAG-NEXT: v_mul_lo_u32 v11, v8, v4
; GFX11-SDAG-NEXT: v_mul_lo_u32 v13, v3, v5
@@ -6215,51 +6216,53 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v0, 1
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v13, v4, 0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v15, v6, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v5, v[1:2]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v15, v7, v[3:4]
; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v13
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v14, v4, v[8:9]
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v16, v6, v[9:10]
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v10, v14, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, v10, v14, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v15
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v4, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, v11, v16, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v11, v16, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v18, v6, 0
; GFX11-GISEL-NEXT: v_add_co_u32 v20, vcc_lo, v0, 1
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0, v10, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v10, vcc_lo
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v9
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v8, v20, 0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v13
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v5, v[0:1]
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v10
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v7, v[1:2]
; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, 1
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v11, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v11, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v17, v4, v[13:14]
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v12, v18, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v19, v6, v[14:15]
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v21, v[0:1]
; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v15, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v15, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v12, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v12, v22, v[5:6]
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v17, 0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, 0, v10, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v10, vcc_lo
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v15, v20, v[6:7]
; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v10, v18, v[7:8]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -6281,10 +6284,10 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
; GFX1200-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo
+; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v3, vcc_lo
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v13, v8, v5
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
@@ -6297,10 +6300,10 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1200-SDAG-NEXT: v_add3_u32 v13, v3, v15, v14
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v12, v9, vcc_lo
+; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v9, vcc_lo
; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v2, v10
; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v13, v11, vcc_lo
+; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v9, null, v13, v11, vcc_lo
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v10, v3, v4
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v11, v1, v5
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v8, v7
@@ -6332,7 +6335,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v9, v1, v4
; GFX1200-SDAG-NEXT: v_add3_u32 v3, v10, v3, v6
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i64:
@@ -6345,11 +6347,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1
; GFX1200-GISEL-NEXT: v_add_co_u32 v9, s0, v2, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v1, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v1, vcc_lo
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v8, v4
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v9, v6
; GFX1200-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v11, vcc_lo, 0, v3, s0
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v3, s0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v12, v8, v4
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v13, v9, v6
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -6364,30 +6366,29 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v14, v4
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v3, v15, v6
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v1, v10, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v1, v10, vcc_lo
; GFX1200-GISEL-NEXT: s_wait_alu 0xf1ff
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v11, vcc_lo, v2, v11, s0
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v11, null, v2, v11, s0
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[8:9], null, v14, v5, v[0:1]
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[9:10], null, v15, v7, v[3:4]
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v10, v15, v6
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v7, v14, v4
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v16, v4, v[8:9]
; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v12, 1
-; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[4:5], null, v11, v6, v[9:10]
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX1200-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v8
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v10, v9
-; GFX1200-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v7, 1
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
+; GFX1200-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v7, 1
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v3, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v3, vcc_lo
; GFX1200-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v10, 1
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v11, v7, v8
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[5:6], null, v7, v5, v[0:1]
@@ -6395,7 +6396,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v2, v14, v15
; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v4, vcc_lo
+; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v4, vcc_lo
; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v11, v12
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[5:6], null, v3, v8, v[5:6]
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v9, v[1:2]
@@ -6407,7 +6408,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v12, v[6:7]
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v14, v15
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v15, v[7:8]
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i64> %x, <i64 1, i64 1>
@@ -9888,7 +9888,7 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX10-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1
; GFX10-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: mul_u24_add64:
@@ -9904,9 +9904,9 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1
; GFX11-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: mul_u24_add64:
@@ -9953,16 +9953,16 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mul_u24_zext_add64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mul_u32_u24_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: mul_u24_zext_add64:
@@ -9976,8 +9976,7 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1200-NEXT: s_wait_alu 0xfffd
-; GFX1200-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
-; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
%mul.zext = zext i32 %mul to i64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index b978b13c71c3e..4509d954c5e8b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -388,12 +388,12 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1]
; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3]
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
@@ -536,12 +536,12 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1]
; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3]
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
@@ -704,9 +704,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1]
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
@@ -854,9 +854,9 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1]
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 0dcfb840dec06..96da9b91a3a92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -105,8 +105,8 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; VARIANT4-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; VARIANT4-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
-; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; VARIANT4-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; VARIANT4-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
; VARIANT4-NEXT: global_load_b32 v0, v[0:1], off
; VARIANT4-NEXT: s_wait_loadcnt 0x0
; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -127,8 +127,8 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; VARIANT5-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; VARIANT5-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
-; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; VARIANT5-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; VARIANT5-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo
; VARIANT5-NEXT: global_load_b32 v0, v[0:1], off
; VARIANT5-NEXT: s_wait_loadcnt 0x0
; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1]
@@ -151,8 +151,8 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; VARIANT6-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
; VARIANT6-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; VARIANT6-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; VARIANT6-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; VARIANT6-NEXT: global_load_b32 v0, v[0:1], off
; VARIANT6-NEXT: s_wait_loadcnt 0x0
; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll
index 01df8efbb2e25..a8bf9ef350fea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.prefetch.data.ll
@@ -67,9 +67,10 @@ define amdgpu_ps void @prefetch_data_vgpr_imm_base_sgpr_len(ptr addrspace(4) %pt
; GISEL-LABEL: prefetch_data_vgpr_imm_base_sgpr_len:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x200, v0
-; GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GISEL-NEXT: s_prefetch_data s[2:3], 0x0, s0, 0
; GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index ce79201fb8098..21b9e55e630d5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -78,18 +78,18 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v5, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v5, v3, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v1, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v1, v1, v6, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v9, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -109,7 +109,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v3, 0
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v1, v5
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
; GFX12-NEXT: v_add3_u32 v1, v1, v5, v7
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7
; GFX12-NEXT: s_wait_alu 0xfffd
@@ -119,7 +119,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
@@ -242,33 +242,33 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v5, v2, 0
; GFX11-NEXT: v_mad_i64_i32 v[10:11], null, v5, v3, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v1, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v1, v1, v6, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2
-; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v10, null, 0, v9, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_subrev_co_ci_u32_e64 v7, null, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -289,7 +289,7 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v1, v6
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8
; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8
; GFX12-NEXT: s_wait_alu 0xfffd
@@ -299,10 +299,10 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v10, null, 0, v9, vcc_lo
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
@@ -311,7 +311,7 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
+; GFX12-NEXT: v_subrev_co_ci_u32_e64 v7, null, 0, v5, vcc_lo
; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_wait_alu 0xfffd
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
index 006da0cd18867..13ea8b08d2ade 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
@@ -9,8 +9,8 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
@@ -30,8 +30,8 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/lrint.ll b/llvm/test/CodeGen/AMDGPU/lrint.ll
index 31e6cf6ea645c..58f782fd4ecdd 100644
--- a/llvm/test/CodeGen/AMDGPU/lrint.ll
+++ b/llvm/test/CodeGen/AMDGPU/lrint.ll
@@ -170,9 +170,9 @@ define i64 @intrinsic_lrint_i64_f32(float %arg) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-SDAG-NEXT: v_xor_b32_e32 v0, v0, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: intrinsic_lrint_i64_f32:
@@ -193,8 +193,8 @@ define i64 @intrinsic_lrint_i64_f32(float %arg) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = tail call i64 @llvm.lrint.i64.f32(float %arg)
@@ -373,9 +373,9 @@ define i64 @intrinsic_llrint_i64_f32(float %arg) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-SDAG-NEXT: v_xor_b32_e32 v0, v0, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: intrinsic_llrint_i64_f32:
@@ -396,8 +396,8 @@ define i64 @intrinsic_llrint_i64_f32(float %arg) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = tail call i64 @llvm.llrint.i64.f32(float %arg)
@@ -720,11 +720,12 @@ define <2 x i64> @intrinsic_lrint_v2i64_v2f32(<2 x float> %arg) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX11-SDAG-NEXT: v_xor_b32_e32 v4, v0, v6
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5
-; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v2, v5, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
-; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: intrinsic_lrint_v2i64_v2f32:
@@ -757,12 +758,12 @@ define <2 x i64> @intrinsic_lrint_v2i64_v2f32(<2 x float> %arg) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v5, v0, v3
; GFX11-GISEL-NEXT: v_xor_b32_e32 v4, v4, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v6
-; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v2, v6, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v5, v3
-; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v4, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = tail call <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index 072ee70b840d8..7911631483931 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -303,9 +303,9 @@ define i64 @intrinsic_lround_i64_f32(float %arg) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-SDAG-NEXT: v_xor_b32_e32 v0, v0, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: intrinsic_lround_i64_f32:
@@ -332,9 +332,9 @@ define i64 @intrinsic_lround_i64_f32(float %arg) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = tail call i64 @llvm.lround.i64.f32(float %arg)
@@ -594,9 +594,9 @@ define i64 @intrinsic_llround_i64_f32(float %arg) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-SDAG-NEXT: v_xor_b32_e32 v0, v0, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: intrinsic_llround_i64_f32:
@@ -623,9 +623,9 @@ define i64 @intrinsic_llround_i64_f32(float %arg) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = tail call i64 @llvm.llround.i64.f32(float %arg)
@@ -1278,11 +1278,12 @@ define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-NEXT: v_xor_b32_e32 v3, v3, v6
; GFX11-SDAG-NEXT: v_xor_b32_e32 v4, v0, v6
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5
-; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v2, v5, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
-; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: intrinsic_lround_v2i64_v2f32:
@@ -1328,10 +1329,11 @@ define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v5, v0, v3
; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v6
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v2, v6, vcc_lo
; GFX11-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v5, v3
-; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v4, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 4fb6a0114b499..4a6b2ebd3d203 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -392,6 +392,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
+; CHECK-NEXT: v_add_co_ci_u32_e64 v0, null, 0, v1, vcc_lo
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
@@ -410,9 +411,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_add_co_u32 v2, s4, s64, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s65, 0, s4
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
@@ -480,7 +481,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_lshrrev_b64 v[0:1], 16, v[45:46]
; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, v10
; CHECK-NEXT: v_or3_b32 v8, v8, v9, v62
-; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
; CHECK-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
; CHECK-NEXT: v_or_b32_e32 v1, v11, v1
; CHECK-NEXT: ; implicit-def: $vgpr42
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index b539f8a9aafb8..c5c95380fde9b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -318,8 +318,8 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[9:10]
; GFX1100-NEXT: v_add_co_u32 v7, vcc_lo, v0, v12
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v13, vcc_lo
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1100-NEXT: v_add_co_ci_u32_e64 v8, null, v1, v13, vcc_lo
; GFX1100-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
; GFX1100-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v11, v3, vcc_lo
; GFX1100-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v7, v4, vcc_lo
@@ -346,9 +346,9 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_co_ci_u32_e64 v9, null, 0, 0, s0
; GFX1150-NEXT: v_mad_u64_u32 v[8:9], null, v13, v14, v[8:9]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0
-; GFX1150-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
+; GFX1150-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v1, vcc_lo
; GFX1150-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
; GFX1150-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -382,7 +382,7 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v13, v14, v[8:9]
; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v1, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
@@ -1115,13 +1115,14 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_mad_i64_i32 v[8:9], null, v0, v1, 0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2
-; GFX1100-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
+; GFX1100-NEXT: v_add_co_ci_u32_e64 v1, null, v9, v3, vcc_lo
; GFX1100-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4
-; GFX1100-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v5, vcc_lo
; GFX1100-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6
-; GFX1100-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
+; GFX1100-NEXT: v_add_co_ci_u32_e64 v5, null, v9, v7, vcc_lo
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1100-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX1100-NEXT: v_xor_b32_e32 v1, v1, v3
@@ -1134,13 +1135,14 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, 0
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX1150-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX1150-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX1150-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4
-; GFX1150-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v5, vcc_lo
; GFX1150-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
-; GFX1150-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
+; GFX1150-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, vcc_lo
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1150-NEXT: v_xor_b32_e32 v2, v2, v4
; GFX1150-NEXT: v_xor_b32_e32 v3, v3, v5
@@ -1160,19 +1162,18 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v5, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, vcc_lo
; GFX12-NEXT: v_xor_b32_e32 v2, v2, v4
; GFX12-NEXT: v_xor_b32_e32 v3, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -1220,9 +1221,9 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_mad_i64_i32 v[4:5], null, v0, v1, 0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
-; GFX1100-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
+; GFX1100-NEXT: v_add_co_ci_u32_e64 v1, null, v5, v3, vcc_lo
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX1100-NEXT: v_xor_b32_e32 v1, v1, v5
@@ -1232,9 +1233,9 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, 0
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX1150-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX1150-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1150-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX1150-NEXT: v_xor_b32_e32 v1, v3, v1
@@ -1251,10 +1252,9 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -1777,11 +1777,11 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: lshr_mad_i64_negative_3:
@@ -1796,11 +1796,10 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = add i64 %arg0, 1
%lsh = lshr i64 %arg0, 32
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 401724443567a..b3b529d4e5e5b 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -44,9 +44,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v5, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, null, s3, v5, vcc_lo
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_not_b32_e32 v3, v3
@@ -72,11 +72,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s3, v5, vcc_lo
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_not_b32_e32 v3, v3
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 565fce0e7abde..0f1c1cf0d80af 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -17,7 +17,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; CHECK-NEXT: .LBB0_1: ; %load-store-loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 0x100
; CHECK-NEXT: s_clause 0xf
@@ -37,7 +37,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48
; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97]
; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224
@@ -93,7 +93,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240
; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224
@@ -122,7 +122,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68
; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64
; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
@@ -734,14 +734,14 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0)
; UNROLL3-NEXT: .LBB0_1: ; %load-store-loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
; UNROLL3-NEXT: s_add_u32 s4, s4, 48
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16
; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
@@ -774,7 +774,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; CHECK-NEXT: .LBB1_1: ; %load-store-loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 0x100
; CHECK-NEXT: s_clause 0xf
@@ -794,7 +794,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48
; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off
; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: s_waitcnt vmcnt(15)
; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224
@@ -849,7 +849,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; ALIGNED-NEXT: .LBB1_1: ; %load-store-loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240
; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224
@@ -878,7 +878,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68
; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64
; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
@@ -1488,14 +1488,14 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1)
; UNROLL3-NEXT: .LBB1_1: ; %load-store-loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
; UNROLL3-NEXT: s_add_u32 s4, s4, 48
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off
; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16
; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off
@@ -1527,7 +1527,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; CHECK-NEXT: .LBB2_1: ; %load-store-loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 0x100
; CHECK-NEXT: s_clause 0xf
@@ -1547,7 +1547,7 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32
; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16
; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: s_waitcnt vmcnt(15)
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240
@@ -1595,9 +1595,9 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo
; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240
; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224
@@ -2095,14 +2095,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4)
; UNROLL3-NEXT: .LBB2_1: ; %load-store-loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
; UNROLL3-NEXT: s_add_u32 s4, s4, 48
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off
; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16
@@ -3649,7 +3649,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 0x100
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
@@ -4589,7 +4589,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v0, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v0, vcc_lo
; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:250
; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251
; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:249
@@ -5375,7 +5375,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5)
; UNROLL3-NEXT: buffer_load_dword v15, v3, s[0:3], 0 offen offset:44
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
; UNROLL3-NEXT: s_add_u32 s4, s4, 48
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3
; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
@@ -5425,9 +5425,9 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240
@@ -5491,9 +5491,9 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240
@@ -5572,7 +5572,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240
; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224
@@ -5601,7 +5601,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64
; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
@@ -6201,7 +6201,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240
; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224
@@ -6230,7 +6230,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324
; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320
; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
@@ -6847,9 +6847,9 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; UNROLL3-NEXT: .LBB5_2: ; %memmove_fwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16
@@ -6890,9 +6890,9 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; UNROLL3-NEXT: .LBB5_6: ; %memmove_bwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16
@@ -6929,9 +6929,9 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; CHECK-NEXT: .LBB6_2: ; %memmove_fwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240
@@ -6995,9 +6995,9 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; CHECK-NEXT: .LBB6_5: ; %memmove_bwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240
@@ -7075,7 +7075,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: .LBB6_2: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[20:21], off offset:240
; ALIGNED-NEXT: global_load_dwordx4 v[22:25], v[20:21], off offset:224
@@ -7104,7 +7104,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64
; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
@@ -7702,7 +7702,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: .LBB6_5: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240
; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224
@@ -7731,7 +7731,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324
; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320
; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
@@ -8346,9 +8346,9 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; UNROLL3-NEXT: .LBB6_2: ; %memmove_fwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off
; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16
@@ -8389,9 +8389,9 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; UNROLL3-NEXT: .LBB6_6: ; %memmove_bwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off
; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16
@@ -8427,9 +8427,9 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; CHECK-NEXT: .LBB7_2: ; %memmove_fwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224
@@ -8493,9 +8493,9 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; CHECK-NEXT: .LBB7_5: ; %memmove_bwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224
@@ -8566,9 +8566,9 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: .LBB7_2: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo
; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:240
; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224
@@ -9064,9 +9064,9 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: .LBB7_5: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo
; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240
; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224
@@ -9569,9 +9569,9 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; UNROLL3-NEXT: .LBB7_2: ; %memmove_fwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off
@@ -9614,9 +9614,9 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; UNROLL3-NEXT: .LBB7_6: ; %memmove_bwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off
@@ -12526,7 +12526,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 0x100
; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; CHECK-NEXT: s_addc_u32 s5, s5, 0
@@ -12630,7 +12630,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2
; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
; CHECK-NEXT: s_addc_u32 s5, s5, -1
@@ -13578,7 +13578,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v4, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v4, vcc_lo
; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250
; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251
; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249
@@ -15147,7 +15147,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v3, vcc_lo
; ALIGNED-NEXT: flat_store_byte v[2:3], v1 offset:250
; ALIGNED-NEXT: flat_store_byte v[2:3], v7 offset:251
; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249
@@ -15941,7 +15941,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: buffer_load_dword v14, v3, s[0:3], 0 offen offset:40
; UNROLL3-NEXT: buffer_load_dword v15, v3, s[0:3], 0 offen offset:44
; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_add_u32 s4, s4, 48
; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3
; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
@@ -16010,7 +16010,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:40
; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:44
; UNROLL3-NEXT: v_add_co_u32 v15, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v16, null, s5, v1, vcc_lo
; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2
; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index bc8bcc622810f..272daa9dd0b59 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -43,15 +43,15 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[4:5]
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
; CHECK-NEXT: .LBB0_6: ; %Flow30
@@ -60,24 +60,24 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB0_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v4, v[2:3]
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB0_8
; CHECK-NEXT: .LBB0_9: ; %Flow28
@@ -94,26 +94,26 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB0_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v12, v[10:11]
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[4:5], v12
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB0_12
; CHECK-NEXT: .LBB0_13: ; %Flow34
@@ -122,20 +122,20 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB0_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
@@ -191,15 +191,15 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB1_5
; CHECK-NEXT: .LBB1_6: ; %Flow32
@@ -208,24 +208,24 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_cbranch_execz .LBB1_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB1_8
; CHECK-NEXT: .LBB1_9: ; %Flow30
@@ -242,26 +242,26 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_cbranch_execz .LBB1_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v2, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v3, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v12, v[4:5], off
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[10:11], v12
; CHECK-NEXT: v_add_co_u32 v10, s5, v10, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, -1, v11, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB1_12
; CHECK-NEXT: .LBB1_13: ; %Flow36
@@ -270,20 +270,20 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_cbranch_execz .LBB1_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
@@ -340,14 +340,14 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[13:16], v4
; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, -1, v12, s5
; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16]
; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB2_5
; CHECK-NEXT: .LBB2_6: ; %Flow34
@@ -357,7 +357,7 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v6, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
@@ -365,14 +365,14 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v3, v2
; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v8, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB2_8
; CHECK-NEXT: .LBB2_9: ; %Flow32
@@ -389,24 +389,24 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_cbranch_execz .LBB2_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v11, v4
; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v8, s4
; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[9:10], v11
; CHECK-NEXT: v_add_co_u32 v9, s5, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, -1, v10, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB2_12
; CHECK-NEXT: .LBB2_13: ; %Flow38
@@ -416,7 +416,7 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
@@ -424,9 +424,9 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v6, vcc_lo
; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
-; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, v1, v6, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
@@ -483,15 +483,15 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB3_5
; CHECK-NEXT: .LBB3_6: ; %Flow29
@@ -500,24 +500,24 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_cbranch_execz .LBB3_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB3_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v4
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB3_8
; CHECK-NEXT: .LBB3_9: ; %Flow27
@@ -534,26 +534,26 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_cbranch_execz .LBB3_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB3_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v12, v[10:11], off
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[4:5], v12
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB3_12
; CHECK-NEXT: .LBB3_13: ; %Flow33
@@ -562,20 +562,20 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_cbranch_execz .LBB3_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
@@ -636,14 +636,14 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: buffer_load_dword v15, v4, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v16, v4, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, -1, v12, s5
; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16]
; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB4_5
; CHECK-NEXT: .LBB4_6: ; %Flow34
@@ -653,7 +653,7 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v6, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
@@ -661,14 +661,14 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v8, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB4_8
; CHECK-NEXT: .LBB4_9: ; %Flow32
@@ -685,24 +685,24 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: s_cbranch_execz .LBB4_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB4_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v8, s4
; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_byte v[9:10], v11
; CHECK-NEXT: v_add_co_u32 v9, s5, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, -1, v10, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB4_12
; CHECK-NEXT: .LBB4_13: ; %Flow38
@@ -712,7 +712,7 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
@@ -724,9 +724,9 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v6, vcc_lo
; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
-; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, v1, v6, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
@@ -783,15 +783,15 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[4:5]
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB5_5
; CHECK-NEXT: .LBB5_6: ; %Flow32
@@ -800,24 +800,24 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB5_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB5_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v4, v[2:3]
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_byte v[0:1], v4, off
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB5_8
; CHECK-NEXT: .LBB5_9: ; %Flow30
@@ -834,26 +834,26 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB5_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB5_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v12, v[10:11]
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: global_store_byte v[4:5], v12, off
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB5_12
; CHECK-NEXT: .LBB5_13: ; %Flow36
@@ -862,20 +862,20 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB5_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
@@ -929,15 +929,15 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB6_5
; CHECK-NEXT: .LBB6_6: ; %Flow36
@@ -946,24 +946,24 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_cbranch_execz .LBB6_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB6_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[0:1], v4, off
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB6_8
; CHECK-NEXT: .LBB6_9: ; %Flow34
@@ -980,26 +980,26 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_cbranch_execz .LBB6_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB6_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v12, v[10:11], off
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[4:5], v12, off
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB6_12
; CHECK-NEXT: .LBB6_13: ; %Flow40
@@ -1008,20 +1008,20 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_cbranch_execz .LBB6_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
@@ -1059,7 +1059,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ds_read_b128 v[10:13], v9
; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v15, null, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
@@ -1080,13 +1080,13 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v7, v2
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 1
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
@@ -1141,15 +1141,15 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off
; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB8_5
; CHECK-NEXT: .LBB8_6: ; %Flow33
@@ -1158,24 +1158,24 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_cbranch_execz .LBB8_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB8_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[0:1], v4, off
; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB8_8
; CHECK-NEXT: .LBB8_9: ; %Flow31
@@ -1192,26 +1192,26 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_cbranch_execz .LBB8_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB8_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: global_load_ubyte v12, v[10:11], off
; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT: s_or_b32 s8, s4, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_byte v[4:5], v12, off
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB8_12
; CHECK-NEXT: .LBB8_13: ; %Flow37
@@ -1220,20 +1220,20 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_cbranch_execz .LBB8_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6
-; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT: v_mov_b32_e32 v7, v5
; CHECK-NEXT: v_mov_b32_e32 v6, v4
@@ -1276,7 +1276,7 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: buffer_load_dword v13, v9, s[0:3], 0 offen offset:12
; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v15, null, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
@@ -1297,13 +1297,13 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4
; CHECK-NEXT: s_add_u32 s4, s4, 1
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
@@ -1363,9 +1363,9 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10]
; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, -1, v12, s5
; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1382,15 +1382,15 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v2, v8, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v8, s5
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v2, v[0:1]
; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s5
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, 0, v1, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1412,19 +1412,19 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB10_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v2, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v11, v[9:10]
; CHECK-NEXT: v_add_co_u32 v5, s4, v5, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s4, -1, v6, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s4
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1439,15 +1439,16 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, -1, v2, vcc_lo
; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16
; CHECK-NEXT: s_mov_b32 s5, 0
+; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v2, v8, vcc_lo
; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v8, vcc_lo
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4]
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -1484,7 +1485,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: .LBB11_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
@@ -1507,11 +1508,11 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
-; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
@@ -1565,7 +1566,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[9:12], v3
; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5
; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
; CHECK-NEXT: s_or_b32 s8, s5, s8
@@ -1587,7 +1588,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v2, v1
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5]
; CHECK-NEXT: s_or_b32 s8, s5, s8
@@ -1616,7 +1617,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_u8 v8, v7
; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s4
; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v7
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[4:5]
; CHECK-NEXT: s_or_b32 s7, s4, s7
@@ -1636,12 +1637,12 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4
; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5
; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4
-; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ds_read_b128 v[5:8], v4
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
@@ -1678,7 +1679,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: .LBB13_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
@@ -1701,11 +1702,11 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
-; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
@@ -1837,9 +1838,9 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10]
; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, -1, v12, s5
; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v16, v4, s[0:3], 0 offen offset:12
@@ -1859,15 +1860,15 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v2, v8, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v8, s5
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB15_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v2, v[0:1]
; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s5
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, 0, v1, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1889,19 +1890,19 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_cbranch_execz .LBB15_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v2, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB15_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flat_load_ubyte v11, v[9:10]
; CHECK-NEXT: v_add_co_u32 v5, s4, v5, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s4, -1, v6, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s4
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1916,16 +1917,16 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, -1, v2, vcc_lo
; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB15_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v2, v8, vcc_lo
; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v8, vcc_lo
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
@@ -1965,7 +1966,7 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: .LBB16_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
@@ -1991,11 +1992,11 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
-; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
@@ -2103,7 +2104,7 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: .LBB18_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
@@ -2129,11 +2130,11 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
-; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo
; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4
-; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo
; CHECK-NEXT: s_add_u32 s4, s4, 1
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
@@ -2190,7 +2191,7 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16
-; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5
; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen offset:12
@@ -2218,7 +2219,7 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5]
; CHECK-NEXT: s_or_b32 s8, s5, s8
@@ -2247,7 +2248,7 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_ubyte v8, v7, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s4
; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v7
; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[4:5]
; CHECK-NEXT: s_or_b32 s7, s4, s7
@@ -2267,7 +2268,7 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4
; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5
; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4
-; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; CHECK-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB19_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2277,7 +2278,7 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: buffer_load_dword v6, v4, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v7, v4, s[0:3], 0 offen
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index f5bbf8a02e980..7dd45181a8356 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -3117,8 +3117,8 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add3_u32 v0, v12, v11, v4
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, v7, v0, vcc_lo
; GFX11-NEXT: global_store_b128 v17, v[8:11], s[2:3]
; GFX11-NEXT: s_endpgm
;
@@ -3154,8 +3154,8 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v3, v7, v3, v4
; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v3, vcc_lo
; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3]
; GFX12-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 1ecf8f228c625..bf450ab6e80c4 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -151,7 +151,8 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -180,7 +181,8 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -212,7 +214,8 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -241,7 +244,8 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fffff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -273,7 +277,8 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -316,7 +321,8 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -359,7 +365,8 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -402,7 +409,8 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -485,7 +493,8 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -514,7 +523,8 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -546,7 +556,8 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -575,7 +586,8 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x3fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -607,7 +619,8 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4094
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -621,10 +634,9 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388606
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
@@ -640,7 +652,8 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -654,10 +667,9 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 16777214
%load = load i8, ptr %gep, align 4
@@ -687,7 +699,8 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -730,7 +743,8 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -773,7 +787,8 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -816,7 +831,8 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -830,10 +846,9 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
@@ -845,10 +860,9 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -16777215
%load = load i8, ptr %gep, align 4
@@ -879,7 +893,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -893,10 +908,9 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
@@ -912,7 +926,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -926,10 +941,9 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589936639
%load = load i8, ptr %gep, align 4
@@ -960,7 +974,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -974,10 +989,9 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
@@ -993,7 +1007,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1007,10 +1022,9 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589936640
%load = load i8, ptr %gep, align 4
@@ -1041,7 +1055,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1055,10 +1070,9 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
@@ -1074,7 +1088,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1088,10 +1103,9 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589938687
%load = load i8, ptr %gep, align 4
@@ -1122,7 +1136,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1136,10 +1151,9 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
@@ -1151,10 +1165,9 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589938688
%load = load i8, ptr %gep, align 4
@@ -1185,7 +1198,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1199,10 +1213,9 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
@@ -1218,7 +1231,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1232,10 +1246,9 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589942783
%load = load i8, ptr %gep, align 4
@@ -1266,7 +1279,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1280,10 +1294,9 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
@@ -1295,10 +1308,9 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589942784
%load = load i8, ptr %gep, align 4
@@ -1330,7 +1342,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1344,10 +1357,9 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
@@ -1369,10 +1381,9 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854773761
%load = load i8, ptr %gep, align 4
@@ -1404,7 +1415,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1418,10 +1430,9 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
@@ -1443,10 +1454,9 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854773760
%load = load i8, ptr %gep, align 4
@@ -1478,7 +1488,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1492,10 +1503,9 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
@@ -1517,10 +1527,9 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854771713
%load = load i8, ptr %gep, align 4
@@ -1552,7 +1561,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1566,10 +1576,9 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
@@ -1591,10 +1600,9 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854771712
%load = load i8, ptr %gep, align 4
@@ -1626,7 +1634,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1640,10 +1649,9 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
@@ -1665,10 +1673,9 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854767617
%load = load i8, ptr %gep, align 4
@@ -1700,7 +1707,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1714,10 +1722,9 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
@@ -1739,10 +1746,9 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854767616
%load = load i8, ptr %gep, align 4
@@ -3277,8 +3283,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3290,8 +3296,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3376,8 +3382,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3389,8 +3395,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3475,8 +3481,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3488,8 +3494,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3574,8 +3580,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3587,8 +3593,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3673,8 +3679,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3686,8 +3692,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3772,8 +3778,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3785,8 +3791,8 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 43ed3c7eb87ad..e426bc73af66d 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -156,7 +156,8 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -194,7 +195,8 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -226,7 +228,8 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fffff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -264,7 +267,8 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -372,7 +376,8 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -415,7 +420,8 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -506,7 +512,8 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -544,7 +551,8 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -576,7 +584,8 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x3fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -614,7 +623,8 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -646,7 +656,8 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -660,10 +671,9 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffffe, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
@@ -688,7 +698,8 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4094
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -702,10 +713,9 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8388606
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -774,7 +784,8 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -817,7 +828,8 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -860,7 +872,8 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -874,10 +887,9 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
@@ -902,7 +914,8 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff001000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -916,10 +929,9 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -951,7 +963,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -965,10 +978,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
@@ -993,7 +1005,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1007,10 +1020,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1041,7 +1053,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1055,10 +1068,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
@@ -1074,7 +1086,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1088,10 +1101,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1122,7 +1134,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1136,10 +1149,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
@@ -1164,7 +1176,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1178,10 +1191,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1212,7 +1224,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1226,10 +1239,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
@@ -1241,10 +1253,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1275,7 +1286,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1289,10 +1301,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
@@ -1317,7 +1328,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1331,10 +1343,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1365,7 +1376,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1379,10 +1391,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
@@ -1394,10 +1405,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1429,7 +1439,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1443,10 +1454,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
@@ -1472,7 +1482,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-2049
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1486,10 +1497,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1521,7 +1531,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1535,10 +1546,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
@@ -1555,7 +1565,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1569,10 +1580,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1604,7 +1614,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1618,10 +1629,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
@@ -1647,7 +1657,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1661,10 +1672,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384513
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1696,7 +1706,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1710,10 +1721,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
@@ -1735,10 +1745,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1770,7 +1779,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1784,10 +1794,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
@@ -1813,7 +1822,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1827,10 +1837,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1862,7 +1871,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1876,10 +1886,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
@@ -1901,10 +1910,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
%load = load i8, ptr addrspace(1) %gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index afb0ab958954c..35b55a0addd95 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -250,55 +250,55 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2048
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x2000
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[10:11], v[6:7], off offset:-4096
; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off offset:2048
; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0x2000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: global_load_b64 v[6:7], v[6:7], off
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:2048
; GFX11-NEXT: global_load_b64 v[14:15], v[0:1], off
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v7, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
@@ -830,9 +830,9 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB1_2 Depth 2
@@ -842,26 +842,28 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: .LBB1_2: ; %for.body
; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v4, 0xffffc000
-; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, 0xffffc000, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, 0xffffd000, v4
-; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v5, vcc_lo
; GFX11-NEXT: global_load_b64 v[13:14], v[7:8], off offset:-4096
+; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, 0xffffd000, v4
; GFX11-NEXT: global_load_b64 v[9:10], v[9:10], off offset:-2048
-; GFX11-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, -1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v4, 0xffffe000
-; GFX11-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v16, null, -1, v5, vcc_lo
; GFX11-NEXT: global_load_b64 v[11:12], v[11:12], off offset:-2048
; GFX11-NEXT: v_add_co_u32 v17, vcc_lo, 0xffffe000, v4
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[19:20], v[15:16], off offset:-4096
; GFX11-NEXT: global_load_b64 v[7:8], v[7:8], off
-; GFX11-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, -1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, -1, v5, vcc_lo
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: global_load_b64 v[17:18], v[17:18], off offset:-2048
; GFX11-NEXT: global_load_b64 v[15:16], v[15:16], off
@@ -870,47 +872,48 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: global_load_b64 v[25:26], v[4:5], off offset:-2048
; GFX11-NEXT: global_load_b64 v[27:28], v[4:5], off
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
; GFX11-NEXT: s_addk_i32 s2, 0x2000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_cmp_gt_u32 s2, 0x3fffff
; GFX11-NEXT: s_waitcnt vmcnt(10)
; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v10, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_co_u32 v2, s0, v7, v2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v8, v3, s0
; GFX11-NEXT: v_add_co_u32 v2, s0, v11, v2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v12, v3, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v3, s0
; GFX11-NEXT: v_add_co_u32 v2, s0, v19, v2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v20, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_add_co_u32 v2, s0, v17, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v18, v3, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v18, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_add_co_u32 v2, s0, v15, v2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v16, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_add_co_u32 v2, s0, v21, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v22, v3, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v22, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_co_u32 v2, s0, v23, v2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v24, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_add_co_u32 v2, s0, v25, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v26, v3, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v27, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v28, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v28, v3, vcc_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB1_2
; GFX11-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX11-NEXT: ; in Loop: Header=BB1_1 Depth=1
@@ -1252,16 +1255,17 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v7, v[0:1], off
; GFX11-NEXT: global_load_b32 v8, v[0:1], off offset:1024
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: global_load_b32 v9, v[0:1], off offset:2048
; GFX11-NEXT: global_load_b32 v10, v[0:1], off offset:3072
@@ -1270,7 +1274,8 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX11-NEXT: global_load_b32 v13, v[2:3], off offset:2048
; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:3072
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v3, v[4:5], off
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:1024
@@ -1508,12 +1513,12 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v8
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[4:5], v[2:3], off
; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
@@ -1522,15 +1527,15 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v7, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
@@ -1721,14 +1726,15 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x80000000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:2048
@@ -1970,16 +1976,16 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s37, 0, s0
; GFX11-NEXT: v_add_co_u32 v8, s0, s38, v12
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, s39, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0x2000
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, s39, 0, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x2000, v8
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v9, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x3000, v8
; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:-4096
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off offset:2048
@@ -1988,19 +1994,21 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v10, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v11, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v11, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v12, v[0:1], s[36:37]
; GFX11-NEXT: s_endpgm
ptr addrspace(1) %buffer2) {
@@ -2285,20 +2293,21 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x3000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0x2000, v0
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT: global_load_b64 v[6:7], v[2:3], off offset:2048
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v1, vcc_lo
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:2048
; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off
@@ -2307,30 +2316,28 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v7, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v7, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v13, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v11, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v15, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
@@ -2516,19 +2523,20 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4
; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:-2048
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35]
; GFX11-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index a995187390806..d89e57245e8ea 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -1192,7 +1192,7 @@ define i64 @v_mul_sub_1_i64(i64 %x, i64 %y) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, -1
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX10-NEXT: v_mul_lo_u32 v4, v1, v2
; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v2, 0
@@ -1258,7 +1258,7 @@ define i64 @v_mul_sub_1_i64_commute(i64 %x, i64 %y) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, -1
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX10-NEXT: v_mul_lo_u32 v4, v2, v1
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, 0
@@ -1326,7 +1326,7 @@ define i64 @v_mul_sub_x_i64(i64 %x, i64 %y) {
; GFX10-NEXT: v_mad_u64_u32 v[2:3], null, v0, v2, 0
; GFX10-NEXT: v_add3_u32 v3, v3, v5, v4
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v0
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
%mul = mul i64 %x, %y
%sub = sub i64 %mul, %x
@@ -1388,7 +1388,7 @@ define i64 @v_mul_add_2_i64(i64 %x, i64 %y) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX10-NEXT: v_mul_lo_u32 v4, v1, v2
; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v2, 0
@@ -1454,7 +1454,7 @@ define i64 @v_mul_sub_2_i64(i64 %x, i64 %y) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, -2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; GFX10-NEXT: v_mul_lo_u32 v4, v1, v2
; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v2, 0
@@ -3391,7 +3391,7 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v4, v1, v[4:5]
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, s4, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s5, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v2, null, s5, v3, vcc_lo
; GFX10-NEXT: global_store_dword v[1:2], v0, off
; GFX10-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index a63d9f22236d5..4177179b31c06 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -558,11 +558,12 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 1a17efa562b79..4e27cf20d3c98 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -492,7 +492,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
index c04cb89e9527b..6541342d75d32 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
@@ -4,7 +4,7 @@
; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5]
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 43
; GCN: v_add_co_u32 v[[EXTRA_LO:[0-9]+]], vcc_lo, 0x80, v4
-; GCN: v_add_co_ci_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc_lo, 0, v5, vcc_lo
+; GCN: v_add_co_ci_u32_e64 v[[EXTRA_HI:[0-9]+]], null, 0, v5, vcc_lo
; GCN: global_atomic_csub v{{[0-9]+}}, v[[[LO]]:[[HI]]], [[K]], off offset:512 glc
; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]]
define i32 @shl_base_atomicrmw_global_atomic_csub_ptr(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index a6117578b399b..50056b62b3397 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -7299,9 +7299,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, s3, 0, s4
; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x804
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x80, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:4 ; 16-byte Folded Spill
@@ -7351,7 +7351,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], off offset:244 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v6, vcc_lo, 0x180, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v[6:7], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], off offset:260 ; 16-byte Folded Spill
@@ -7377,7 +7377,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], off offset:372 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v8, vcc_lo, 0x200, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[10:13], v[8:9], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[10:13], off offset:388 ; 16-byte Folded Spill
@@ -7403,7 +7403,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[10:13], off offset:500 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v10, vcc_lo, 0x280, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[12:15], v[10:11], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:516 ; 16-byte Folded Spill
@@ -7429,7 +7429,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:628 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v12, vcc_lo, 0x300, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[14:17], v[12:13], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[14:17], off offset:644 ; 16-byte Folded Spill
@@ -7455,7 +7455,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[14:17], off offset:756 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v14, vcc_lo, 0x380, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v[14:15], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:772 ; 16-byte Folded Spill
@@ -7481,7 +7481,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:884 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v16, vcc_lo, 0x400, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[18:21], v[16:17], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:900 ; 16-byte Folded Spill
@@ -7507,9 +7507,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], off offset:1012 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v18, vcc_lo, 0x480, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v20, vcc_lo, 0x500, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[18:19], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1028 ; 16-byte Folded Spill
@@ -7557,7 +7557,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1252 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:2032
; GFX10-FLATSCR-NEXT: v_add_co_u32 v20, vcc_lo, 0x580, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1268 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:1920
@@ -7583,7 +7583,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1380 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:2032
; GFX10-FLATSCR-NEXT: v_add_co_u32 v20, vcc_lo, 0x600, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1396 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:1920
@@ -7609,7 +7609,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1508 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:2032
; GFX10-FLATSCR-NEXT: v_add_co_u32 v20, vcc_lo, 0x680, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1524 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:1920
@@ -7635,7 +7635,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1636 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:2032
; GFX10-FLATSCR-NEXT: v_add_co_u32 v20, vcc_lo, 0x700, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1652 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:1920
@@ -7661,9 +7661,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1764 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v[20:21], off offset:2032
; GFX10-FLATSCR-NEXT: v_add_co_u32 v20, vcc_lo, 0x780, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0, v22, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v0
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:1780 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[22:25], v[20:21], off offset:1920
@@ -7713,7 +7713,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:2020 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2032
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v2
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:2036 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1920
@@ -7749,7 +7749,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v6
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v7, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v7, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x884
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -7784,7 +7784,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v8
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v9, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x904
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -7819,7 +7819,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v10
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v11, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v11, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x984
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -7854,7 +7854,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v12
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v13, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v13, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa04
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -7889,7 +7889,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v14
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v15, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa84
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -7924,7 +7924,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v16
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v17, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v17, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb04
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -7959,7 +7959,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v18
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v19, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v19, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb84
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -9031,10 +9031,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x480, v4
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x780, v0
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[2:3], v[7:10], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
@@ -9067,10 +9067,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[2:3], v[7:10], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x400, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX10-FLATSCR-NEXT: v_add_co_u32 v7, vcc_lo, 0x780, v2
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v3, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v3, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[7:8], v[9:12], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[9:12], off, s0 ; 16-byte Folded Reload
@@ -9103,10 +9103,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[7:8], v[9:12], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v7, vcc_lo, 0x380, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX10-FLATSCR-NEXT: v_add_co_u32 v9, vcc_lo, 0x780, v7
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v8, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[9:10], v[11:14], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload
@@ -9139,10 +9139,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[9:10], v[11:14], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v9, vcc_lo, 0x300, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX10-FLATSCR-NEXT: v_add_co_u32 v11, vcc_lo, 0x780, v9
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v10, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v10, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[11:12], v[13:16], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[13:16], off, s0 ; 16-byte Folded Reload
@@ -9175,10 +9175,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[11:12], v[13:16], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v11, vcc_lo, 0x280, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX10-FLATSCR-NEXT: v_add_co_u32 v13, vcc_lo, 0x780, v11
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v12, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v12, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[13:14], v[15:18], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload
@@ -9211,10 +9211,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[13:14], v[15:18], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v13, vcc_lo, 0x200, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX10-FLATSCR-NEXT: v_add_co_u32 v15, vcc_lo, 0x780, v13
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v14, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v14, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[15:16], v[17:20], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[17:20], off, s0 ; 16-byte Folded Reload
@@ -9247,10 +9247,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[15:16], v[17:20], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v15, vcc_lo, 0x180, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX10-FLATSCR-NEXT: v_add_co_u32 v17, vcc_lo, 0x780, v15
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, 0, v16, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v16, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[17:18], v[19:22], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload
@@ -9283,10 +9283,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[17:18], v[19:22], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v17, vcc_lo, 0x100, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX10-FLATSCR-NEXT: v_add_co_u32 v19, vcc_lo, 0x780, v17
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v18, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v18, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[19:20], v[21:24], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[21:24], off, s0 ; 16-byte Folded Reload
@@ -9318,9 +9318,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[19:20], v[21:24], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:2036 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v19, vcc_lo, 0x80, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v21, vcc_lo, 0x780, v19
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:2020 ; 16-byte Folded Reload
@@ -9346,7 +9346,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1908 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v21, vcc_lo, 0x780, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1892 ; 16-byte Folded Reload
@@ -9372,7 +9372,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1780 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v21, vcc_lo, 0x700, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1764 ; 16-byte Folded Reload
@@ -9398,7 +9398,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1652 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v21, vcc_lo, 0x680, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1636 ; 16-byte Folded Reload
@@ -9424,7 +9424,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1524 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v21, vcc_lo, 0x600, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1508 ; 16-byte Folded Reload
@@ -9450,9 +9450,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:1920
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1396 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: v_add_co_u32 v21, vcc_lo, 0x580, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, vcc_lo, 0x500, v4
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[21:22], v[23:26], off offset:2032
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[23:26], off, off offset:1380 ; 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 877088be27086..40d80f5e83e36 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -1116,7 +1116,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index a3562a452b6f1..8118441df0cfc 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -761,7 +761,8 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX12-NEXT: global_load_b64 v[2:3], v2, s[4:5]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
@@ -856,10 +857,11 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: global_load_b128 v[4:7], v4, s[4:5]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
-; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
@@ -988,17 +990,18 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: global_load_b128 v[12:15], v12, s[4:5] offset:16
; GFX12-NEXT: s_wait_loadcnt 0x2
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
-; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v10, vcc_lo, v10, v14
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v11, v15, vcc_lo
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v11, null, v11, v15, vcc_lo
; GFX12-NEXT: v_sub_co_u32 v8, vcc_lo, v8, v12
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v9, v13, vcc_lo
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v9, null, v9, v13, vcc_lo
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 056e1d038571b..79adc9ead62e1 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -735,8 +735,8 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index e9da27a6f96b6..f4be2bba16da5 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -1426,7 +1426,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
@@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
@@ -1755,7 +1755,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1030-NEXT: v_fma_f32 v2, -v3, v1, v2
; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
@@ -1952,7 +1952,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1030-NEXT: v_fma_f32 v2, -v3, v1, v2
; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index cfcb2438e1768..90491a07289a0 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -757,15 +757,25 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: v_usubsat_i64:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
-; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_usubsat_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
index 292a281de0296..5404d402828b0 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
@@ -2724,7 +2724,8 @@ define i64 @test_vector_reduce_add_v2i64(<2 x i64> %v) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_vector_reduce_add_v2i64:
@@ -2736,8 +2737,7 @@ define i64 @test_vector_reduce_add_v2i64(<2 x i64> %v) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %v)
@@ -2794,10 +2794,11 @@ define i64 @test_vector_reduce_add_v3i64(<3 x i64> %v) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_vector_reduce_add_v3i64:
@@ -2809,11 +2810,10 @@ define i64 @test_vector_reduce_add_v3i64(<3 x i64> %v) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%res = call i64 @llvm.vector.reduce.add.v3i64(<3 x i64> %v)
@@ -2913,24 +2913,26 @@ define i64 @test_vector_reduce_add_v4i64(<4 x i64> %v) {
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_vector_reduce_add_v4i64:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_vector_reduce_add_v4i64:
@@ -2942,14 +2944,13 @@ define i64 @test_vector_reduce_add_v4i64(<4 x i64> %v) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: test_vector_reduce_add_v4i64:
@@ -2961,14 +2962,13 @@ define i64 @test_vector_reduce_add_v4i64(<4 x i64> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
@@ -3132,44 +3132,48 @@ define i64 @test_vector_reduce_add_v8i64(<8 x i64> %v) {
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v13, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v6, vcc_lo, v6, v14
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v15, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v11, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_vector_reduce_add_v8i64:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v11, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v13, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v6, v14
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v15, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_vector_reduce_add_v8i64:
@@ -3181,26 +3185,25 @@ define i64 @test_vector_reduce_add_v8i64(<8 x i64> %v) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v13, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v6, vcc_lo, v6, v14
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v15, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v11, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: test_vector_reduce_add_v8i64:
@@ -3212,26 +3215,25 @@ define i64 @test_vector_reduce_add_v8i64(<8 x i64> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v11, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v13, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v6, v14
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v15, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %v)
@@ -3540,41 +3542,45 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) {
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32
; GFX11-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v10, v26
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v11, v27, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v11, null, v11, v27, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v18
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v19, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v19, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v6, vcc_lo, v6, v22
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v23, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v23, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v8, v24
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v25, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v12, vcc_lo, v12, v28
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v13, v29, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v13, null, v13, v29, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v4, v20
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v21, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v21, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v16
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v17, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v17, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v13, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v12, vcc_lo, v14, v30
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v15, v31, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v13, null, v15, v31, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v11, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v6, vcc_lo, v6, v12
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v13, vcc_lo
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_vector_reduce_add_v16i64:
@@ -3582,40 +3588,45 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: scratch_load_b32 v31, off, s32
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v16
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v17, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v17, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v18
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v19, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v19, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v20
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v21, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v21, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v6, v22
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v23, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v23, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v8, v24
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v25, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v10, v26
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v11, v27, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v11, null, v11, v27, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v12, v28
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v13, v29, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, v13, v29, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v11, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v14, v30
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v15, v31, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v15, v31, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v13, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v9, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_vector_reduce_add_v16i64:
@@ -3628,51 +3639,50 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) {
; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32
; GFX12-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v10, v26
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v11, v27, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v11, null, v11, v27, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v18
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v19, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v19, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v6, vcc_lo, v6, v22
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v23, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v23, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v8, v24
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v25, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v12, vcc_lo, v12, v28
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v13, v29, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v13, null, v13, v29, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v4, v20
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v21, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v21, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v16
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v17, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v17, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v13, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v12, vcc_lo, v14, v30
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v15, v31, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v13, null, v15, v31, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v11, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v6, vcc_lo, v6, v12
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v13, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v13, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: test_vector_reduce_add_v16i64:
@@ -3685,51 +3695,50 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) {
; GFX12-GISEL-NEXT: scratch_load_b32 v31, off, s32
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v16
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v17, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v17, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v18
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v19, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v19, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v20
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v21, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v21, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v6, v22
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v23, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v23, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v8, v24
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v25, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v10, v26
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v11, v27, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v11, null, v11, v27, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v12, v28
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v13, v29, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, v13, v29, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v11, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v14, v30
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v15, v31, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v15, v31, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v13, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v9, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %v)
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index 803235a2aa67f..3ba492b0d25b5 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -10,8 +10,8 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; CHECK-NEXT: global_load_b32 v5, v[0:1], off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
@@ -78,8 +78,8 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT: v_lshlrev_b64_e32 v[3:4], 2, v[3:4]
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill
>From 8ad1d93000a48764884cb9a2c0ae77c3f5a1acee Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Thu, 27 Mar 2025 15:20:03 +0100
Subject: [PATCH 2/2] merge checks
---
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 06d5e48cdab73..73343e1c80f33 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -973,16 +973,13 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
continue;
}
- if (TII->isVOP3(MI.getOpcode()) &&
- TII->hasVALU32BitEncoding(MI.getOpcode())) {
+ // Try to replace a dead sdst with NULL. If there is no chance we will shrink
+ // it and use VCC as sdst to get a 32-bit form, move on to the next instruction.
+ if (TII->isVOP3(MI.getOpcode())) {
tryReplaceDeadSDST(MI);
- }
-
- if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
- // If there is no chance we will shrink it and use VCC as sdst to get
- // a 32 bit form try to replace dead sdst with NULL.
- tryReplaceDeadSDST(MI);
- continue;
+ if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
+ continue;
+ }
}
if (!TII->canShrink(MI, *MRI)) {
More information about the llvm-commits
mailing list