[llvm] c316332 - [Sink] Allow sinking of invariant loads across critical edges
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 5 17:21:52 PDT 2022
Author: Carl Ritson
Date: 2022-10-06T09:21:12+09:00
New Revision: c316332e1789221ec26875d1dc335382b6e68d83
URL: https://github.com/llvm/llvm-project/commit/c316332e1789221ec26875d1dc335382b6e68d83
DIFF: https://github.com/llvm/llvm-project/commit/c316332e1789221ec26875d1dc335382b6e68d83.diff
LOG: [Sink] Allow sinking of invariant loads across critical edges
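Invariant loads are not affected by stores on other code paths, so the restriction against sinking loads across critical edges need not apply to them.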
Reviewed By: foad, arsenm
Differential Revision: https://reviews.llvm.org/D135133
Added:
llvm/test/Transforms/Sink/invariant-load.ll
Modified:
llvm/lib/Transforms/Scalar/Sink.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
llvm/test/CodeGen/AMDGPU/madak.ll
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
llvm/test/CodeGen/AMDGPU/sdiv64.ll
llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
llvm/test/CodeGen/AMDGPU/srem64.ll
llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
llvm/test/CodeGen/AMDGPU/udiv64.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index e8fde53005f0b..dad45c47e0c24 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -79,7 +79,8 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
- if (Inst->mayReadFromMemory())
+ if (Inst->mayReadFromMemory() &&
+ !Inst->hasMetadata(LLVMContext::MD_invariant_load))
return false;
// We don't want to sink across a critical edge if we don't dominate the
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 908874e073e73..99ef58147896c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1354,20 +1354,20 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7-NEXT: s_cbranch_execz .LBB13_2
; GFX7-NEXT: ; %bb.1: ; %bb
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x14
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x14
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_cmp_lg_u32 s0, 0
+; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: s_cselect_b32 s6, 1, 0
; GFX7-NEXT: .LBB13_2: ; %exit
; GFX7-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX7-NEXT: s_and_b32 s0, 1, s6
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX7-NEXT: s_mov_b32 s6, -1
@@ -1382,30 +1382,30 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2]
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB13_2
; GFX8-NEXT: ; %bb.1: ; %bb
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x50
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_cmp_lg_u32 s0, 0
-; GFX8-NEXT: s_cselect_b32 s6, 1, 0
+; GFX8-NEXT: s_cmp_lg_u32 s4, 0
+; GFX8-NEXT: s_cselect_b32 s4, 1, 0
; GFX8-NEXT: .LBB13_2: ; %exit
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_u32 s0, s2, 8
-; GFX8-NEXT: s_addc_u32 s1, s3, 0
-; GFX8-NEXT: s_and_b32 s2, 1, s6
+; GFX8-NEXT: s_add_u32 s0, s0, 8
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_and_b32 s2, 1, s4
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_nop 2
@@ -1420,29 +1420,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28
; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10_W32-NEXT: s_mov_b32 s5, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3]
; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX10_W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10_W32-NEXT: s_mov_b32 s2, 0
+; GFX10_W32-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W32-NEXT: ; %bb.1: ; %bb
-; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x50
+; GFX10_W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10_W32-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0
-; GFX10_W32-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0
; GFX10_W32-NEXT: .LBB13_2: ; %exit
-; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10_W32-NEXT: s_and_b32 s0, 1, s5
-; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10_W32-NEXT: s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] offset:8
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
@@ -1450,58 +1450,58 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28
; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10_W64-NEXT: s_mov_b32 s6, 0
+; GFX10_W64-NEXT: s_mov_b32 s4, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3]
; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W64-NEXT: ; %bb.1: ; %bb
-; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x50
+; GFX10_W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x50
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10_W64-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
-; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10_W64-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10_W64-NEXT: s_cselect_b32 s4, 1, 0
; GFX10_W64-NEXT: .LBB13_2: ; %exit
-; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX10_W64-NEXT: s_and_b32 s0, 1, s6
-; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10_W64-NEXT: s_and_b32 s2, 1, s4
+; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] offset:8
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8
; GFX10_W64-NEXT: s_endpgm
;
; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W32: ; %bb.0: ; %entry
; GFX11_W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x28
; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX11_W32-NEXT: s_mov_b32 s5, 0
-; GFX11_W32-NEXT: s_mov_b32 s4, exec_lo
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W32-NEXT: global_load_b96 v[1:3], v1, s[2:3]
-; GFX11_W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11_W32-NEXT: s_mov_b32 s2, 0
+; GFX11_W32-NEXT: s_mov_b32 s3, exec_lo
; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W32-NEXT: ; %bb.1: ; %bb
-; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x50
+; GFX11_W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x50
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11_W32-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11_W32-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11_W32-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11_W32-NEXT: s_cselect_b32 s2, 1, 0
; GFX11_W32-NEXT: .LBB13_2: ; %exit
-; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX11_W32-NEXT: s_and_b32 s0, 1, s5
-; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W32-NEXT: s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
; GFX11_W32-NEXT: s_waitcnt vmcnt(0)
; GFX11_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT: global_store_b32 v1, v0, s[2:3] offset:8
+; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] offset:8
; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11_W32-NEXT: s_endpgm
;
@@ -1509,29 +1509,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX11_W64: ; %bb.0: ; %entry
; GFX11_W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x28
; GFX11_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX11_W64-NEXT: s_mov_b32 s6, 0
-; GFX11_W64-NEXT: s_mov_b64 s[4:5], exec
+; GFX11_W64-NEXT: s_mov_b32 s4, 0
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11_W64-NEXT: global_load_b96 v[1:3], v1, s[2:3]
-; GFX11_W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec
; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W64-NEXT: ; %bb.1: ; %bb
-; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x50
+; GFX11_W64-NEXT: s_load_b64 s[4:5], s[0:1], 0x50
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11_W64-NEXT: s_load_b32 s4, s[4:5], 0x0
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11_W64-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11_W64-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11_W64-NEXT: s_cselect_b32 s4, 1, 0
; GFX11_W64-NEXT: .LBB13_2: ; %exit
-; GFX11_W64-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX11_W64-NEXT: s_and_b32 s0, 1, s6
-; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W64-NEXT: s_and_b32 s2, 1, s4
+; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX11_W64-NEXT: s_waitcnt vmcnt(0)
; GFX11_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT: global_store_b32 v1, v0, s[2:3] offset:8
+; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] offset:8
; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11_W64-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index e4dfe024568a3..fd2251bff0b1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -5,17 +5,17 @@
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-LABEL: test_wave32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB0_2
; GFX10-NEXT: ; %bb.1: ; %mid
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: .LBB0_2: ; %bb
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
@@ -25,9 +25,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
;
; GFX11-LABEL: test_wave32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB0_2
@@ -36,6 +34,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB0_2: ; %bb
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
index af7e551cbb1bd..1e39c6395ca7d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
@@ -4,16 +4,17 @@
define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) {
; GCN-LABEL: test_wave64:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[4:5], 0x0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %mid
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: .LBB0_2: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index ca6df8ccf4b1f..bf0758a645eba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -27,14 +27,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
; GCN-NEXT: ; %bb.2: ; %bb.1
-; GCN-NEXT: s_load_dword s6, s[4:5], 0x10
-; GCN-NEXT: s_add_u32 s7, s32, 0x1000
+; GCN-NEXT: s_load_dword s7, s[4:5], 0x10
+; GCN-NEXT: s_add_u32 s6, s32, 0x1000
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s6, s6, 2
-; GCN-NEXT: s_add_u32 s6, s7, s6
+; GCN-NEXT: s_lshl_b32 s7, s7, 2
+; GCN-NEXT: s_add_u32 s6, s6, s7
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; GCN-NEXT: v_mov_b32_e32 v2, s6
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 0d1cb169eaee2..c7bd302850671 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -17,200 +17,200 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 imm
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_constant:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB0_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX6-NEXT: s_mul_i32 s0, s0, 5
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX6-NEXT: s_mul_i32 s4, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB0_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
+; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB0_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX8-NEXT: s_mul_i32 s0, s0, 5
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8-NEXT: s_mul_i32 s4, s4, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX9-NEXT: s_mul_i32 s0, s0, 5
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: s_mul_i32 s4, s4, 5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
-; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB0_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX10W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX10W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_mov_b32 s5, exec_lo
+; GFX10W32-NEXT: s_mov_b32 s3, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB0_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5
-; GFX10W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX10W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB0_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_constant:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s2, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB0_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
+; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB0_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -222,159 +222,155 @@ entry:
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
; GFX6-LABEL: add_i32_uniform:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB1_2
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mul_i32 s0, s8, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX6-NEXT: s_mul_i32 s4, s6, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB1_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB1_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_clause 0x1
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB1_2
; GFX10W64-NEXT: ; %bb.1:
-; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: s_mul_i32 s0, s8, s0
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX10W64-NEXT: s_mul_i32 s4, s6, s4
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_clause 0x1
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44
-; GFX10W32-NEXT: s_mov_b32 s6, exec_lo
+; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44
+; GFX10W32-NEXT: s_mov_b32 s4, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB1_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: s_mul_i32 s0, s4, s0
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_mul_i32 s4, s2, s4
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s4
; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_clause 0x1
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44
+; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -382,54 +378,54 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB1_2
; GFX11W64-NEXT: ; %bb.1:
-; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_mul_i32 s0, s8, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s6, s4
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB1_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_uniform:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_clause 0x1
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44
-; GFX11W32-NEXT: s_mov_b32 s6, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44
+; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB1_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_mul_i32 s0, s4, s0
+; GFX11W32-NEXT: s_mul_i32 s4, s2, s4
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s4
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -453,17 +449,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
;
; GFX8-LABEL: add_i32_varying_vdata:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -475,44 +470,44 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB2_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB2_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dword v[3:4], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying_vdata:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -524,28 +519,29 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB2_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -566,43 +562,41 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT: v_mov_b32_e32 v2, s4
; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: ; implicit-def: $vgpr0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB2_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W64-NEXT: v_mov_b32_e32 v0, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB2_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, v3
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -613,44 +607,42 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_mov_b32_e32 v2, v1
; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31
+; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: ; implicit-def: $vgpr0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB2_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX10W32-NEXT: s_mov_b32 s5, s6
+; GFX10W32-NEXT: s_mov_b32 s3, s4
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX10W32-NEXT: .LBB2_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, v3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -676,46 +668,44 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mov_b32_e32 v2, s4
; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX11W64-NEXT: ; implicit-def: $vgpr0
-; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX11W64-NEXT: s_cbranch_execz .LBB2_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W64-NEXT: v_mov_b32_e32 v0, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB2_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, v3
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
@@ -726,8 +716,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -735,42 +726,39 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_mov_b32_e32 v2, v1
; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11W32-NEXT: ; implicit-def: $vgpr0
-; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX11W32-NEXT: s_cbranch_execz .LBB2_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX11W32-NEXT: s_mov_b32 s5, s6
+; GFX11W32-NEXT: s_mov_b32 s3, s4
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: v_mov_b32_e32 v0, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
+; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB2_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, v3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -797,17 +785,16 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
;
; GFX8-LABEL: struct_add_i32_varying_vdata:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -819,46 +806,46 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB3_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_load_dword s7, s[0:1], 0x44
+; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB3_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dword v[3:4], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: struct_add_i32_varying_vdata:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -870,30 +857,31 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB3_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dword s7, s[0:1], 0x44
+; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB3_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: struct_add_i32_varying_vdata:
@@ -914,46 +902,44 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT: v_mov_b32_e32 v2, s4
; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: ; implicit-def: $vgpr0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB3_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_clause 0x1
-; GFX10W64-NEXT: s_load_dword s7, s[0:1], 0x44
+; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W64-NEXT: v_mov_b32_e32 v0, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mov_b32_e32 v4, s7
+; GFX10W64-NEXT: v_mov_b32_e32 v4, s5
; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
; GFX10W64-NEXT: .LBB3_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, v3
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: struct_add_i32_varying_vdata:
@@ -964,47 +950,45 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_mov_b32_e32 v2, v1
; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31
+; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: ; implicit-def: $vgpr0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB3_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_mov_b32 s5, s6
+; GFX10W32-NEXT: s_mov_b32 s3, s4
; GFX10W32-NEXT: s_clause 0x1
-; GFX10W32-NEXT: s_load_dword s6, s[0:1], 0x44
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: v_mov_b32_e32 v0, s5
+; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mov_b32_e32 v4, s6
-; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
+; GFX10W32-NEXT: v_mov_b32_e32 v4, s8
+; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[4:7], 0 idxen glc
; GFX10W32-NEXT: .LBB3_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, v3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: struct_add_i32_varying_vdata:
@@ -1030,49 +1014,47 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mov_b32_e32 v2, s4
; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX11W64-NEXT: ; implicit-def: $vgpr0
-; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX11W64-NEXT: s_cbranch_execz .LBB3_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_clause 0x1
-; GFX11W64-NEXT: s_load_b32 s7, s[0:1], 0x44
+; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W64-NEXT: v_mov_b32_e32 v0, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: v_mov_b32_e32 v4, s7
+; GFX11W64-NEXT: v_mov_b32_e32 v4, s5
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB3_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, v3
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
@@ -1083,8 +1065,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1092,45 +1075,42 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out,
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_mov_b32_e32 v2, v1
; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11W32-NEXT: ; implicit-def: $vgpr0
-; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX11W32-NEXT: s_cbranch_execz .LBB3_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_mov_b32 s5, s6
+; GFX11W32-NEXT: s_mov_b32 s3, s4
; GFX11W32-NEXT: s_clause 0x1
-; GFX11W32-NEXT: s_load_b32 s6, s[0:1], 0x44
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: v_mov_b32_e32 v0, s5
+; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: v_mov_b32_e32 v0, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: v_mov_b32_e32 v4, s6
-; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
+; GFX11W32-NEXT: v_mov_b32_e32 v4, s8
+; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB3_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, v3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -1213,207 +1193,207 @@ entry:
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_constant:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB5_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX6-NEXT: s_mul_i32 s0, s0, 5
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX6-NEXT: s_mul_i32 s4, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB5_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB5_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX8-NEXT: s_mul_i32 s0, s0, 5
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8-NEXT: s_mul_i32 s4, s4, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB5_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX9-NEXT: s_mul_i32 s0, s0, 5
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: s_mul_i32 s4, s4, 5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
-; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB5_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX10W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX10W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_mov_b32 s5, exec_lo
+; GFX10W32-NEXT: s_mov_b32 s3, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB5_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5
-; GFX10W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX10W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
+; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB5_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB5_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: sub_i32_constant:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s2, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB5_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
+; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB5_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -1425,161 +1405,157 @@ entry:
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
; GFX6-LABEL: sub_i32_uniform:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB6_2
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mul_i32 s0, s8, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX6-NEXT: s_mul_i32 s4, s6, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB6_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB6_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB6_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB6_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_clause 0x1
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB6_2
; GFX10W64-NEXT: ; %bb.1:
-; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: s_mul_i32 s0, s8, s0
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX10W64-NEXT: s_mul_i32 s4, s6, s4
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_clause 0x1
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44
-; GFX10W32-NEXT: s_mov_b32 s6, exec_lo
+; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44
+; GFX10W32-NEXT: s_mov_b32 s4, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB6_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: s_mul_i32 s0, s4, s0
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_mul_i32 s4, s2, s4
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s4
; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_clause 0x1
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44
+; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1587,56 +1563,56 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB6_2
; GFX11W64-NEXT: ; %bb.1:
-; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_mul_i32 s0, s8, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s6, s4
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB6_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: sub_i32_uniform:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_clause 0x1
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44
-; GFX11W32-NEXT: s_mov_b32 s6, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44
+; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB6_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_mul_i32 s0, s4, s0
+; GFX11W32-NEXT: s_mul_i32 s4, s2, s4
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s4
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB6_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -1660,17 +1636,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
;
; GFX8-LABEL: sub_i32_varying_vdata:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1682,44 +1657,44 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB7_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB7_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dword v[3:4], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying_vdata:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1731,28 +1706,29 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB7_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB7_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1773,43 +1749,41 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT: v_mov_b32_e32 v2, s4
; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: ; implicit-def: $vgpr0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB7_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W64-NEXT: v_mov_b32_e32 v0, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB7_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, v3
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1820,44 +1794,42 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_mov_b32_e32 v2, v1
; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31
+; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: ; implicit-def: $vgpr0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB7_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX10W32-NEXT: s_mov_b32 s5, s6
+; GFX10W32-NEXT: s_mov_b32 s3, s4
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
; GFX10W32-NEXT: .LBB7_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, v3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1883,46 +1855,44 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mov_b32_e32 v2, s4
; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX11W64-NEXT: ; implicit-def: $vgpr0
-; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX11W64-NEXT: s_cbranch_execz .LBB7_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W64-NEXT: v_mov_b32_e32 v0, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB7_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, v3
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
@@ -1933,8 +1903,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1942,42 +1913,39 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_mov_b32_e32 v2, v1
; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11W32-NEXT: ; implicit-def: $vgpr0
-; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX11W32-NEXT: s_cbranch_execz .LBB7_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX11W32-NEXT: s_mov_b32 s5, s6
+; GFX11W32-NEXT: s_mov_b32 s3, s4
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: v_mov_b32_e32 v0, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
+; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB7_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, v3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 94dfe2325c5fe..ea9022c83c4dc 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -19,107 +19,104 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
; GFX7LESS-LABEL: add_i32_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB0_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB0_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8-NEXT: s_mul_i32 s2, s2, 5
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8-NEXT: s_mul_i32 s4, s4, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB0_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 1
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT: s_mul_i32 s2, s2, 5
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: s_mul_i32 s4, s4, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB0_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 1
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s2, s2, 5
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: s_mul_i32 s4, s4, 5
+; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2
@@ -127,7 +124,8 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
@@ -138,7 +136,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
@@ -158,6 +155,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
@@ -168,28 +166,28 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s2, s2, 5
+; GFX1164-NEXT: s_mul_i32 s4, s4, 5
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -202,7 +200,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
@@ -222,6 +219,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -242,117 +240,115 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
;
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2
+; GFX7LESS-NEXT: s_mul_i32 s4, s6, s4
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB1_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s2, s6, s2
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB1_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s2, s6, s2
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s2, s6, s2
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: s_mul_i32 s4, s6, s4
+; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2
@@ -360,32 +356,31 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
-; GFX1064-NEXT: s_mov_b32 s6, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s1, s2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: s_mul_i32 s4, s2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2
@@ -393,84 +388,83 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
-; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_mul_i32 s4, s6, s4
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
-; GFX1164-NEXT: s_mov_b32 s6, -1
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
-; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132-NEXT: s_mul_i32 s4, s2, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
-; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
@@ -497,7 +491,6 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
@@ -536,19 +529,18 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB2_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
@@ -586,13 +578,13 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -614,11 +606,8 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -649,6 +638,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB2_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0
@@ -665,16 +655,13 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -699,6 +686,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB2_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0
@@ -730,12 +718,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -767,6 +752,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -784,8 +770,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -793,22 +780,17 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -824,6 +806,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1075,7 +1058,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-LABEL: add_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
@@ -1093,22 +1075,22 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB4_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
@@ -1130,18 +1112,19 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 2
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
@@ -1162,18 +1145,19 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
@@ -1194,6 +1178,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
@@ -1205,7 +1190,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1032-LABEL: add_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
@@ -1225,6 +1209,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
@@ -1236,7 +1221,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1164-LABEL: add_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
@@ -1258,6 +1242,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1271,7 +1256,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1132-LABEL: add_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
@@ -1292,6 +1276,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1677,110 +1662,107 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
;
; GFX7LESS-LABEL: sub_i32_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB7_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB7_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX8-NEXT: s_mul_i32 s2, s2, 5
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8-NEXT: s_mul_i32 s4, s4, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB7_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB7_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT: s_mul_i32 s2, s2, 5
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: s_mul_i32 s4, s4, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB7_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB7_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s2, s2, 5
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: s_mul_i32 s4, s4, 5
+; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2
@@ -1788,7 +1770,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
@@ -1800,7 +1783,6 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: sub_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
@@ -1820,6 +1802,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
@@ -1831,28 +1814,28 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
;
; GFX1164-LABEL: sub_i32_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB7_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s2, s2, 5
+; GFX1164-NEXT: s_mul_i32 s4, s4, 5
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB7_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -1866,7 +1849,6 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
;
; GFX1132-LABEL: sub_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
@@ -1886,6 +1868,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB7_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
@@ -1907,117 +1890,115 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
;
; GFX7LESS-LABEL: sub_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
-; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2
; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2
+; GFX7LESS-NEXT: s_mul_i32 s4, s6, s4
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB8_2:
-; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB8_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s2, s6, s2
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB8_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s2, s6, s2
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB8_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB8_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s2, s6, s2
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: s_mul_i32 s4, s6, s4
+; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2
@@ -2025,33 +2006,32 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB8_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s6, -1
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB8_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s1, s2, s1
-; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: s_mul_i32 s4, s2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2
@@ -2059,87 +2039,86 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB8_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s6, -1
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c
+; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB8_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_mul_i32 s4, s6, s4
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
-; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB8_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132-NEXT: s_mul_i32 s4, s2, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
@@ -2166,7 +2145,6 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
@@ -2205,19 +2183,18 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB9_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
@@ -2255,13 +2232,13 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB9_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2283,11 +2260,8 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -2318,6 +2292,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0
@@ -2334,16 +2309,13 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2368,6 +2340,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0
@@ -2399,12 +2372,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -2436,6 +2406,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2453,8 +2424,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -2462,22 +2434,17 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -2493,6 +2460,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2744,7 +2712,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-LABEL: sub_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
@@ -2762,22 +2729,22 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB11_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
@@ -2796,22 +2763,22 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
@@ -2829,22 +2796,22 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
@@ -2865,6 +2832,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB11_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
@@ -2879,7 +2847,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
@@ -2899,6 +2866,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB11_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
@@ -2913,7 +2881,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1164-LABEL: sub_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
@@ -2935,6 +2902,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB11_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
@@ -2951,7 +2919,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1132-LABEL: sub_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
@@ -2972,6 +2939,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB11_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
@@ -3386,7 +3354,6 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: and_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -3425,19 +3392,18 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB14_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_and_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_and_b32_e32 v0, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: and_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -3475,13 +3441,13 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB14_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_and_b32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3503,11 +3469,8 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -3538,6 +3501,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB14_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0
@@ -3554,16 +3518,13 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3588,6 +3549,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB14_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0
@@ -3619,12 +3581,9 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -3656,6 +3615,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB14_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3673,8 +3633,9 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, -1
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: v_mov_b32_e32 v3, -1
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -3682,22 +3643,17 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_mov_b32_e32 v3, -1
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -3713,6 +3669,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB14_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3747,7 +3704,6 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: or_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
@@ -3786,19 +3742,18 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB15_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_or_b32_e32 v0, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: or_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
@@ -3836,13 +3791,13 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB15_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_or_b32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3864,11 +3819,8 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -3899,6 +3851,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB15_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0
@@ -3915,16 +3868,13 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3949,6 +3899,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB15_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0
@@ -3980,12 +3931,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -4017,6 +3965,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB15_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4034,8 +3983,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4043,22 +3993,17 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -4074,6 +4019,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB15_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4108,7 +4054,6 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: xor_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
@@ -4147,19 +4092,18 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB16_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
@@ -4197,13 +4141,13 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB16_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4225,11 +4169,8 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -4260,6 +4201,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB16_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0
@@ -4276,16 +4218,13 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4310,6 +4249,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB16_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0
@@ -4341,12 +4281,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -4378,6 +4315,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB16_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4395,8 +4333,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4404,22 +4343,17 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -4435,6 +4369,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB16_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4469,7 +4404,6 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: max_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -4508,19 +4442,18 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB17_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_max_i32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_max_i32_e32 v0, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -4558,13 +4491,13 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB17_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_max_i32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_max_i32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4586,11 +4519,8 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -4621,6 +4551,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB17_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0
@@ -4637,16 +4568,13 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4671,6 +4599,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB17_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0
@@ -4702,12 +4631,9 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -4739,6 +4665,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB17_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4756,8 +4683,9 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -4765,22 +4693,17 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -4796,6 +4719,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB17_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4817,7 +4741,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
;
; GFX7LESS-LABEL: max_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -4834,25 +4757,25 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB18_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4
; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: max_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -4870,24 +4793,25 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -4904,24 +4828,25 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -4940,6 +4865,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB18_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
@@ -4955,7 +4881,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1032-LABEL: max_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -4973,6 +4898,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB18_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
@@ -4988,7 +4914,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1164-LABEL: max_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -5007,6 +4932,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB18_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
@@ -5024,7 +4950,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1132-LABEL: max_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -5041,6 +4966,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB18_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
@@ -5079,7 +5005,6 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: min_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -5118,19 +5043,18 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB19_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_min_i32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_min_i32_e32 v0, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: min_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -5168,13 +5092,13 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB19_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_min_i32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_min_i32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5196,11 +5120,8 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -5231,6 +5152,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB19_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0
@@ -5247,16 +5169,13 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5281,6 +5200,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB19_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0
@@ -5312,12 +5232,9 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -5349,6 +5266,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB19_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5366,8 +5284,9 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -5375,22 +5294,17 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -5406,6 +5320,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB19_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5427,7 +5342,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
;
; GFX7LESS-LABEL: min_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5444,25 +5358,25 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB20_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4
; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: min_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5482,6 +5396,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
@@ -5489,15 +5404,15 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: min_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5516,6 +5431,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
@@ -5523,15 +5439,15 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: min_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5550,6 +5466,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB20_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
@@ -5557,15 +5474,14 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: min_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5583,6 +5499,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB20_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
@@ -5590,15 +5507,14 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: min_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -5617,6 +5533,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB20_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
@@ -5625,8 +5542,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5634,7 +5551,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1132-LABEL: min_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -5651,6 +5567,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB20_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
@@ -5659,8 +5576,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5689,7 +5606,6 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: umax_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
@@ -5728,19 +5644,18 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB21_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_max_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_max_u32_e32 v0, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
@@ -5778,13 +5693,13 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB21_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_max_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_max_u32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5806,11 +5721,8 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -5841,6 +5753,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB21_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0
@@ -5857,16 +5770,13 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5891,6 +5801,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB21_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0
@@ -5922,12 +5833,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -5959,6 +5867,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB21_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -5976,8 +5885,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5985,22 +5895,17 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -6016,6 +5921,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB21_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -6037,7 +5943,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
; GFX7LESS-LABEL: umax_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6054,24 +5959,24 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB22_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umax_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6088,24 +5993,24 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6121,24 +6026,24 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6157,6 +6062,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
@@ -6172,7 +6078,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1032-LABEL: umax_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6190,6 +6095,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
@@ -6205,7 +6111,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1164-LABEL: umax_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -6224,6 +6129,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB22_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
@@ -6241,7 +6147,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1132-LABEL: umax_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -6258,6 +6163,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB22_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
@@ -6296,7 +6202,6 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
;
; GFX8-LABEL: umin_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -6335,19 +6240,18 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB23_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_min_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_min_u32_e32 v0, s4, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -6385,13 +6289,13 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB23_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_min_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_min_u32_e32 v0, s4, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -6413,11 +6317,8 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
@@ -6448,6 +6349,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB23_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0
@@ -6464,16 +6366,13 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6498,6 +6397,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB23_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0
@@ -6529,12 +6429,9 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -6566,6 +6463,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB23_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -6583,8 +6481,9 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, -1
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: v_mov_b32_e32 v3, -1
; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -6592,22 +6491,17 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: v_mov_b32_e32 v3, -1
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -6623,6 +6517,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB23_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -6644,7 +6539,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6661,10 +6555,12 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
@@ -6672,13 +6568,11 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6695,6 +6589,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
@@ -6704,15 +6599,14 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6728,6 +6622,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
@@ -6737,15 +6632,14 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6764,6 +6658,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -6771,15 +6666,14 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6797,6 +6691,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
@@ -6804,15 +6699,14 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -6831,6 +6725,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB24_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -6839,8 +6734,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6848,7 +6743,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
; GFX1132-LABEL: umin_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -6865,6 +6759,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB24_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
@@ -6873,8 +6768,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 63ca1eb59748d..7e4e430082c49 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -16,200 +16,200 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_constant:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB0_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX6-NEXT: s_mul_i32 s0, s0, 5
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX6-NEXT: s_mul_i32 s4, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB0_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
+; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB0_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX8-NEXT: s_mul_i32 s0, s0, 5
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8-NEXT: s_mul_i32 s4, s4, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX9-NEXT: s_mul_i32 s0, s0, 5
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: s_mul_i32 s4, s4, 5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
-; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB0_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX10W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX10W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_mov_b32 s5, exec_lo
+; GFX10W32-NEXT: s_mov_b32 s3, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB0_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5
-; GFX10W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX10W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX10W32-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB0_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_constant:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s2, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB0_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
+; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB0_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -221,159 +221,155 @@ entry:
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
; GFX6-LABEL: add_i32_uniform:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB1_2
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mul_i32 s0, s8, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX6-NEXT: s_mul_i32 s4, s6, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB1_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB1_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_clause 0x1
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB1_2
; GFX10W64-NEXT: ; %bb.1:
-; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: s_mul_i32 s0, s8, s0
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX10W64-NEXT: s_mul_i32 s4, s6, s4
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_clause 0x1
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44
-; GFX10W32-NEXT: s_mov_b32 s6, exec_lo
+; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44
+; GFX10W32-NEXT: s_mov_b32 s4, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB1_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: s_mul_i32 s0, s4, s0
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_mul_i32 s4, s2, s4
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s4
; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_clause 0x1
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44
+; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -381,54 +377,54 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB1_2
; GFX11W64-NEXT: ; %bb.1:
-; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_mul_i32 s0, s8, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s6, s4
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB1_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_uniform:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_clause 0x1
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44
-; GFX11W32-NEXT: s_mov_b32 s6, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44
+; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB1_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_mul_i32 s0, s4, s0
+; GFX11W32-NEXT: s_mul_i32 s4, s2, s4
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s4
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -452,17 +448,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
;
; GFX8-LABEL: add_i32_varying_vdata:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -474,44 +469,44 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB2_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB2_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dword v[3:4], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying_vdata:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -523,28 +518,29 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB2_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -565,43 +561,41 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT: v_mov_b32_e32 v2, s4
; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: ; implicit-def: $vgpr0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB2_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W64-NEXT: v_mov_b32_e32 v0, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB2_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, v3
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -612,44 +606,42 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_mov_b32_e32 v2, v1
; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31
+; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: ; implicit-def: $vgpr0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB2_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX10W32-NEXT: s_mov_b32 s5, s6
+; GFX10W32-NEXT: s_mov_b32 s3, s4
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX10W32-NEXT: .LBB2_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, v3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -675,46 +667,44 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mov_b32_e32 v2, s4
; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX11W64-NEXT: ; implicit-def: $vgpr0
-; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX11W64-NEXT: s_cbranch_execz .LBB2_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W64-NEXT: v_mov_b32_e32 v0, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB2_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, v3
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
@@ -725,8 +715,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -734,42 +725,39 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_mov_b32_e32 v2, v1
; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11W32-NEXT: ; implicit-def: $vgpr0
-; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX11W32-NEXT: s_cbranch_execz .LBB2_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX11W32-NEXT: s_mov_b32 s5, s6
+; GFX11W32-NEXT: s_mov_b32 s3, s4
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: v_mov_b32_e32 v0, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
+; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB2_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, v3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -852,207 +840,207 @@ entry:
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_constant:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB4_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX6-NEXT: s_mul_i32 s0, s0, 5
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX6-NEXT: s_mul_i32 s4, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB4_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB4_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX8-NEXT: s_mul_i32 s0, s0, 5
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8-NEXT: s_mul_i32 s4, s4, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB4_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX9-NEXT: s_mul_i32 s0, s0, 5
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: s_mul_i32 s4, s4, 5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB4_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
-; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB4_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX10W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX10W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB4_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_mov_b32 s5, exec_lo
+; GFX10W32-NEXT: s_mov_b32 s3, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB4_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5
-; GFX10W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX10W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
+; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc
; GFX10W32-NEXT: .LBB4_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB4_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB4_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: sub_i32_constant:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s2, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB4_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
+; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB4_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -1064,161 +1052,157 @@ entry:
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
; GFX6-LABEL: sub_i32_uniform:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB5_2
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mul_i32 s0, s8, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX6-NEXT: s_mul_i32 s4, s6, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX6-NEXT: .LBB5_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB5_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB5_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_clause 0x1
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44
+; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44
; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB5_2
; GFX10W64-NEXT: ; %bb.1:
-; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: s_mul_i32 s0, s8, s0
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX10W64-NEXT: s_mul_i32 s4, s6, s4
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_clause 0x1
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44
-; GFX10W32-NEXT: s_mov_b32 s6, exec_lo
+; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44
+; GFX10W32-NEXT: s_mov_b32 s4, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB5_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: s_mul_i32 s0, s4, s0
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_mul_i32 s4, s2, s4
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s4
; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_clause 0x1
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44
+; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1226,56 +1210,56 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %in
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB5_2
; GFX11W64-NEXT: ; %bb.1:
-; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_mul_i32 s0, s8, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s6, s4
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB5_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: sub_i32_uniform:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_clause 0x1
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44
-; GFX11W32-NEXT: s_mov_b32 s6, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44
+; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB5_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_mul_i32 s0, s4, s0
+; GFX11W32-NEXT: s_mul_i32 s4, s2, s4
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s4
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB5_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -1299,17 +1283,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
;
; GFX8-LABEL: sub_i32_varying_vdata:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1321,44 +1304,44 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB6_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB6_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dword v[3:4], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying_vdata:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1370,28 +1353,29 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB6_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1412,43 +1396,41 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT: v_mov_b32_e32 v2, s4
; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: ; implicit-def: $vgpr0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB6_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W64-NEXT: v_mov_b32_e32 v0, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, v3
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1459,44 +1441,42 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_mov_b32_e32 v2, v1
; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31
+; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: ; implicit-def: $vgpr0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB6_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX10W32-NEXT: s_mov_b32 s5, s6
+; GFX10W32-NEXT: s_mov_b32 s3, s4
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, v3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1522,46 +1502,44 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mov_b32_e32 v2, s4
; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX11W64-NEXT: ; implicit-def: $vgpr0
-; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX11W64-NEXT: s_cbranch_execz .LBB6_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W64-NEXT: v_mov_b32_e32 v0, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB6_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, v3
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
@@ -1572,8 +1550,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1581,42 +1560,39 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_mov_b32_e32 v2, v1
; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11W32-NEXT: ; implicit-def: $vgpr0
-; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX11W32-NEXT: s_cbranch_execz .LBB6_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX11W32-NEXT: s_mov_b32 s5, s6
+; GFX11W32-NEXT: s_mov_b32 s3, s4
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: v_mov_b32_e32 v0, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
+; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB6_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, v3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index f1fba74ef1b62..4214db631135f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -16,207 +16,207 @@ declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32,
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: add_i32_constant:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB0_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX6-NEXT: s_mul_i32 s0, s0, 5
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX6-NEXT: s_mul_i32 s4, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX6-NEXT: .LBB0_2:
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
+; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB0_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX8-NEXT: s_mul_i32 s0, s0, 5
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8-NEXT: s_mul_i32 s4, s4, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB0_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX9-NEXT: s_mul_i32 s0, s0, 5
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: s_mul_i32 s4, s4, 5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB0_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
-; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB0_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX10W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_mov_b32 s5, exec_lo
+; GFX10W32-NEXT: s_mov_b32 s3, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB0_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX10W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
+; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[4:7], 0 idxen glc
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB0_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX11W64-NEXT: s_mul_i32 s0, s0, 5
+; GFX11W64-NEXT: s_mul_i32 s4, s4, 5
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB0_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_constant:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s2, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB0_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX11W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX11W32-NEXT: s_mul_i32 s0, s0, 5
+; GFX11W32-NEXT: s_mul_i32 s3, s3, 5
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
+; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB0_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -228,221 +228,217 @@ entry:
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
; GFX6-LABEL: add_i32_uniform:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB1_2
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mul_i32 s0, s8, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_mul_i32 s4, s6, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, 0
-; GFX6-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
+; GFX6-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX6-NEXT: .LBB1_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
+; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB1_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
+; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_clause 0x1
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44
-; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44
+; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
-; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB1_2
; GFX10W64-NEXT: ; %bb.1:
-; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: s_mul_i32 s0, s8, s0
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
+; GFX10W64-NEXT: s_mul_i32 s4, s6, s4
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_clause 0x1
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44
-; GFX10W32-NEXT: s_mov_b32 s6, exec_lo
+; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44
+; GFX10W32-NEXT: s_mov_b32 s4, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB1_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: s_mul_i32 s0, s4, s0
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_mul_i32 s4, s2, s4
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s4
; GFX10W32-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_clause 0x1
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB1_2
; GFX11W64-NEXT: ; %bb.1:
-; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_mul_i32 s0, s8, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s6, s4
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[12:15], 0 idxen glc
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB1_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: add_i32_uniform:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_clause 0x1
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44
-; GFX11W32-NEXT: s_mov_b32 s6, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44
+; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB1_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX11W32-NEXT: v_mov_b32_e32 v2, 0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_mul_i32 s0, s4, s0
+; GFX11W32-NEXT: s_mul_i32 s4, s2, s4
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s4
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB1_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -467,17 +463,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
;
; GFX8-LABEL: add_i32_varying_vdata:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -489,45 +484,45 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB2_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB2_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dword v[3:4], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying_vdata:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -539,29 +534,30 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB2_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -582,44 +578,42 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT: v_mov_b32_e32 v2, s4
; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: ; implicit-def: $vgpr0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB2_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W64-NEXT: v_mov_b32_e32 v0, s4
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
; GFX10W64-NEXT: .LBB2_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, v3
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -630,45 +624,43 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_mov_b32_e32 v2, v1
; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31
+; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: ; implicit-def: $vgpr0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB2_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W32-NEXT: s_mov_b32 s3, s4
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: s_mov_b32 s5, s6
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
+; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[4:7], 0 idxen glc
; GFX10W32-NEXT: .LBB2_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, v3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -694,47 +686,45 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mov_b32_e32 v2, s4
; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX11W64-NEXT: ; implicit-def: $vgpr0
-; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX11W64-NEXT: s_cbranch_execz .LBB2_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W64-NEXT: v_mov_b32_e32 v0, s4
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB2_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, v3
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
@@ -745,8 +735,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -754,43 +745,40 @@ define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_mov_b32_e32 v2, v1
; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11W32-NEXT: ; implicit-def: $vgpr0
-; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX11W32-NEXT: s_cbranch_execz .LBB2_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W32-NEXT: s_mov_b32 s3, s4
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: v_mov_b32_e32 v0, s3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX11W32-NEXT: s_mov_b32 s5, s6
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
+; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB2_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, v3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -975,214 +963,214 @@ entry:
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
; GFX6-LABEL: sub_i32_constant:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB5_2
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX6-NEXT: s_mul_i32 s0, s0, 5
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX6-NEXT: s_mul_i32 s4, s4, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX6-NEXT: .LBB5_2:
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB5_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX8-NEXT: s_mul_i32 s0, s0, 5
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX8-NEXT: s_mul_i32 s4, s4, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB5_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX9-NEXT: s_mul_i32 s0, s0, 5
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: s_mul_i32 s4, s4, 5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
-; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB5_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX10W64-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_mov_b32 s5, exec_lo
+; GFX10W32-NEXT: s_mov_b32 s3, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB5_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX10W32-NEXT: s_mul_i32 s0, s0, 5
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_mul_i32 s3, s3, 5
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s3
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
+; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[4:7], 0 idxen glc
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB5_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX11W64-NEXT: s_mul_i32 s0, s0, 5
+; GFX11W64-NEXT: s_mul_i32 s4, s4, 5
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB5_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: sub_i32_constant:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s2, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB5_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX11W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX11W32-NEXT: s_mul_i32 s0, s0, 5
+; GFX11W32-NEXT: s_mul_i32 s3, s3, 5
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s3
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
+; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB5_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -1194,225 +1182,221 @@ entry:
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
; GFX6-LABEL: sub_i32_uniform:
; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_mov_b64 s[2:3], exec
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11
-; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX6-NEXT: s_mov_b64 s[4:5], exec
+; GFX6-NEXT: s_load_dword s6, s[0:1], 0x11
+; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: ; implicit-def: $vgpr1
-; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX6-NEXT: s_cbranch_execz .LBB6_2
; GFX6-NEXT: ; %bb.1:
-; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd
-; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_mul_i32 s0, s8, s0
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
+; GFX6-NEXT: s_mul_i32 s4, s6, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, 0
-; GFX6-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
+; GFX6-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX6-NEXT: .LBB6_2:
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_readfirstlane_b32 s0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: v_mul_lo_u32 v0, s6, v0
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x44
+; GFX8-NEXT: s_mov_b64 s[4:5], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB6_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
+; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB6_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_load_dword s6, s[0:1], 0x44
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_mul_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
+; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB6_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
; GFX10W64: ; %bb.0: ; %entry
-; GFX10W64-NEXT: s_clause 0x1
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44
-; GFX10W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX10W64-NEXT: s_load_dword s6, s[0:1], 0x44
+; GFX10W64-NEXT: s_mov_b64 s[4:5], exec
; GFX10W64-NEXT: ; implicit-def: $vgpr1
-; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB6_2
; GFX10W64-NEXT: ; %bb.1:
-; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
-; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX10W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: s_mul_i32 s0, s8, s0
-; GFX10W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
+; GFX10W64-NEXT: s_mul_i32 s4, s6, s4
+; GFX10W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX10W64-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
; GFX10W32: ; %bb.0: ; %entry
-; GFX10W32-NEXT: s_clause 0x1
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44
-; GFX10W32-NEXT: s_mov_b32 s6, exec_lo
+; GFX10W32-NEXT: s_load_dword s2, s[0:1], 0x44
+; GFX10W32-NEXT: s_mov_b32 s4, exec_lo
; GFX10W32-NEXT: ; implicit-def: $vgpr1
-; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB6_2
; GFX10W32-NEXT: ; %bb.1:
; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX10W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: s_mul_i32 s0, s4, s0
-; GFX10W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX10W32-NEXT: s_mul_i32 s4, s2, s4
+; GFX10W32-NEXT: v_mov_b32_e32 v1, s4
; GFX10W32-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_clause 0x1
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44
-; GFX11W64-NEXT: s_mov_b64 s[6:7], exec
+; GFX11W64-NEXT: s_load_b32 s6, s[0:1], 0x44
; GFX11W64-NEXT: s_mov_b64 s[4:5], exec
-; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W64-NEXT: s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W64-NEXT: s_cbranch_execz .LBB6_2
; GFX11W64-NEXT: ; %bb.1:
-; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34
-; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
+; GFX11W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: s_mul_i32 s0, s8, s0
+; GFX11W64-NEXT: s_mul_i32 s4, s6, s4
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[12:15], 0 idxen glc
+; GFX11W64-NEXT: v_mov_b32_e32 v1, s4
+; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB6_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0
+; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
; GFX11W32-LABEL: sub_i32_uniform:
; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_clause 0x1
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44
-; GFX11W32-NEXT: s_mov_b32 s6, exec_lo
-; GFX11W32-NEXT: s_mov_b32 s5, exec_lo
-; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX11W32-NEXT: s_load_b32 s2, s[0:1], 0x44
+; GFX11W32-NEXT: s_mov_b32 s4, exec_lo
+; GFX11W32-NEXT: s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX11W32-NEXT: ; implicit-def: $vgpr1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11W32-NEXT: s_cbranch_execz .LBB6_2
; GFX11W32-NEXT: ; %bb.1:
; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6
+; GFX11W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX11W32-NEXT: v_mov_b32_e32 v2, 0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: s_mul_i32 s0, s4, s0
+; GFX11W32-NEXT: s_mul_i32 s4, s2, s4
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX11W32-NEXT: v_mov_b32_e32 v1, s4
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB6_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
@@ -1437,17 +1421,16 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
;
; GFX8-LABEL: sub_i32_varying_vdata:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1459,45 +1442,45 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT: v_readlane_b32 s6, v2, 63
+; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB7_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB7_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dword v[3:4], v0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying_vdata:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1509,29 +1492,30 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT: v_readlane_b32 s6, v2, 63
+; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB7_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB7_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1552,44 +1536,42 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W64-NEXT: v_mov_b32_e32 v2, s4
; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15
; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX10W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX10W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10W64-NEXT: ; implicit-def: $vgpr0
-; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10W64-NEXT: s_cbranch_execz .LBB7_2
; GFX10W64-NEXT: ; %bb.1:
; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W64-NEXT: v_mov_b32_e32 v0, s4
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
; GFX10W64-NEXT: .LBB7_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, v3
; GFX10W64-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1600,45 +1582,43 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10W32-NEXT: v_mov_b32_e32 v2, v1
; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31
+; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX10W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX10W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10W32-NEXT: ; implicit-def: $vgpr0
-; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10W32-NEXT: s_cbranch_execz .LBB7_2
; GFX10W32-NEXT: ; %bb.1:
-; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W32-NEXT: v_mov_b32_e32 v0, s6
+; GFX10W32-NEXT: s_mov_b32 s3, s4
+; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: s_mov_b32 s5, s6
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
+; GFX10W32-NEXT: buffer_atomic_sub v0, v4, s[4:7], 0 idxen glc
; GFX10W32-NEXT: .LBB7_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, v3
; GFX10W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3]
+; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1664,47 +1644,45 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W64-NEXT: v_mov_b32_e32 v2, s4
; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15
+; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31
+; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16
; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31
-; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63
-; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47
-; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63
+; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47
+; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48
-; GFX11W64-NEXT: s_mov_b64 exec, s[4:5]
+; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48
+; GFX11W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX11W64-NEXT: ; implicit-def: $vgpr0
-; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX11W64-NEXT: s_cbranch_execz .LBB7_2
; GFX11W64-NEXT: ; %bb.1:
; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W64-NEXT: v_mov_b32_e32 v0, s4
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB7_2:
-; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, v3
; GFX11W64-NEXT: v_mov_b32_e32 v4, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
;
@@ -1715,8 +1693,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1724,43 +1703,40 @@ define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i3
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_mov_b32_e32 v2, v1
; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT: v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31
; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16
-; GFX11W32-NEXT: s_mov_b32 exec_lo, s4
+; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16
+; GFX11W32-NEXT: s_mov_b32 exec_lo, s2
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11W32-NEXT: ; implicit-def: $vgpr0
-; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX11W32-NEXT: s_cbranch_execz .LBB7_2
; GFX11W32-NEXT: ; %bb.1:
-; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W32-NEXT: v_mov_b32_e32 v0, s6
+; GFX11W32-NEXT: s_mov_b32 s3, s4
+; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT: v_mov_b32_e32 v0, s3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX11W32-NEXT: s_mov_b32 s5, s6
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc
+; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v4, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB7_2:
-; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, v3
; GFX11W32-NEXT: v_mov_b32_e32 v4, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3]
+; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1]
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
index d0eecc6973e77..2f98eed6da872 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll
@@ -6,7 +6,6 @@
; instruction, rather than 8 in previous generations.
; GCN-LABEL: {{^}}long_forward_branch_gfx10only:
-; GFX9: s_load_dwordx2
; GFX9: s_cmp_eq_u32
; GFX9-NEXT: s_cbranch_scc1
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 6af32a90d8527..e9b6c7f8d5fb4 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -244,9 +244,8 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia
; because the implicit immediate already uses the constant bus.
; On GFX10+ we can use two scalar operands.
; GCN-LABEL: {{^}}madak_constant_bus_violation:
-; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
-
; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
+; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000
; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5
; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index b180df0782823..9df93bc970a96 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -65,8 +65,8 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
-; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
+; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
; FLATSCR-NEXT: s_mov_b32 s32, s2
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index c2454772ebfaf..4529dc5f1d213 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -970,31 +970,30 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
;
; GCN-IR-LABEL: s_test_sdiv24_48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; GCN-IR-NEXT: s_mov_b32 s15, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
+; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
-; GCN-IR-NEXT: s_sext_i32_i16 s1, s1
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 24
-; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[0:1], 16
-; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 16
-; GCN-IR-NEXT: s_mov_b32 s1, s0
-; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[6:7], 16
-; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9]
+; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16
; GCN-IR-NEXT: s_mov_b32 s3, s2
-; GCN-IR-NEXT: s_sub_u32 s12, s6, s0
-; GCN-IR-NEXT: s_subb_u32 s13, s7, s0
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[10:11]
-; GCN-IR-NEXT: s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT: s_subb_u32 s7, s7, s2
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16
+; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31
+; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7]
+; GCN-IR-NEXT: s_mov_b32 s5, s4
+; GCN-IR-NEXT: s_sub_u32 s12, s6, s2
+; GCN-IR-NEXT: s_subb_u32 s13, s7, s2
+; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s6, s6, s4
+; GCN-IR-NEXT: s_subb_u32 s7, s7, s4
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
-; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[10:11]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7
@@ -1003,28 +1002,28 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13
; GCN-IR-NEXT: s_min_u32 s18, s8, s9
-; GCN-IR-NEXT: s_sub_u32 s10, s14, s18
-; GCN-IR-NEXT: s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[10:11], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[10:11], 63
-; GCN-IR-NEXT: s_or_b64 s[20:21], s[16:17], s[20:21]
-; GCN-IR-NEXT: s_and_b64 s[16:17], s[20:21], exec
-; GCN-IR-NEXT: s_cselect_b32 s17, 0, s13
-; GCN-IR-NEXT: s_cselect_b32 s16, 0, s12
+; GCN-IR-NEXT: s_sub_u32 s16, s14, s18
+; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[16:17], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
+; GCN-IR-NEXT: s_or_b64 s[20:21], s[10:11], s[20:21]
+; GCN-IR-NEXT: s_and_b64 s[10:11], s[20:21], exec
+; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13
+; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12
; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23]
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21]
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s16, s10, 1
-; GCN-IR-NEXT: s_addc_u32 s17, s11, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[16:17], 0
-; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10
+; GCN-IR-NEXT: s_add_u32 s20, s16, 1
+; GCN-IR-NEXT: s_addc_u32 s21, s17, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0
+; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
+; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s16
+; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20
; GCN-IR-NEXT: s_add_u32 s19, s6, -1
; GCN-IR-NEXT: s_addc_u32 s20, s7, -1
; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15]
@@ -1055,19 +1054,21 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3
; GCN-IR-NEXT: .LBB9_4: ; %Flow3
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[6:7]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7]
; GCN-IR-NEXT: .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT: s_xor_b64 s[2:3], s[16:17], s[0:1]
+; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
+; GCN-IR-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1]
; GCN-IR-NEXT: s_sub_u32 s0, s2, s0
; GCN-IR-NEXT: s_subb_u32 s1, s3, s1
-; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s6, -1
+; GCN-IR-NEXT: s_mov_b32 s15, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s14, -1
; GCN-IR-NEXT: v_mov_b32_e32 v0, s1
-; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
+; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT: buffer_store_short v0, off, s[12:15], 0 offset:4
; GCN-IR-NEXT: s_waitcnt expcnt(0)
; GCN-IR-NEXT: v_mov_b32_e32 v0, s0
-; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT: buffer_store_dword v0, off, s[12:15], 0
; GCN-IR-NEXT: s_endpgm
%1 = ashr i48 %x, 24
%2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index bf9041c15969b..fce79701b8b0e 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -11,26 +11,27 @@
define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
; SI-LABEL: sgpr_if_else_salu_br:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xf
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_cmp_lg_u32 s8, 0
+; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB0_4
; SI-NEXT: ; %bb.1: ; %else
-; SI-NEXT: s_add_i32 s2, s11, s0
+; SI-NEXT: s_add_i32 s7, s7, s2
; SI-NEXT: s_cbranch_execnz .LBB0_3
; SI-NEXT: .LBB0_2: ; %if
-; SI-NEXT: s_sub_i32 s2, s9, s10
+; SI-NEXT: s_sub_i32 s7, s5, s6
; SI-NEXT: .LBB0_3: ; %endif
-; SI-NEXT: s_add_i32 s0, s2, s8
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_add_i32 s4, s7, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB0_4:
-; SI-NEXT: ; implicit-def: $sgpr2
+; SI-NEXT: ; implicit-def: $sgpr7
; SI-NEXT: s_branch .LBB0_2
entry:
@@ -55,31 +56,32 @@ endif:
define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) {
; SI-LABEL: sgpr_if_else_salu_br_opt:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dword s6, s[0:1], 0x13
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s4, s[0:1], 0x13
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_cmp_lg_u32 s6, 0
+; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB1_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dword s2, s[0:1], 0x2e
; SI-NEXT: s_load_dword s3, s[0:1], 0x37
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_add_i32 s7, s2, s3
+; SI-NEXT: s_add_i32 s5, s2, s3
; SI-NEXT: s_cbranch_execnz .LBB1_3
; SI-NEXT: .LBB1_2: ; %if
; SI-NEXT: s_load_dword s2, s[0:1], 0x1c
-; SI-NEXT: s_load_dword s0, s[0:1], 0x25
+; SI-NEXT: s_load_dword s3, s[0:1], 0x25
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_add_i32 s7, s2, s0
+; SI-NEXT: s_add_i32 s5, s2, s3
; SI-NEXT: .LBB1_3: ; %endif
-; SI-NEXT: s_add_i32 s0, s7, s6
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_add_i32 s4, s5, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB1_4:
-; SI-NEXT: ; implicit-def: $sgpr7
+; SI-NEXT: ; implicit-def: $sgpr5
; SI-NEXT: s_branch .LBB1_2
entry:
@@ -106,30 +108,32 @@ endif:
define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
; SI-LABEL: sgpr_if_else_valu_br:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xc
; SI-NEXT: v_cvt_f32_u32_e32 v0, v0
; SI-NEXT: ; implicit-def: $sgpr8
; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0
-; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; SI-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB2_2
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_add_i32 s8, s2, s3
+; SI-NEXT: s_add_i32 s8, s6, s7
; SI-NEXT: .LBB2_2: ; %Flow
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
+; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_xor_b64 exec, exec, s[2:3]
+; SI-NEXT: s_cbranch_execz .LBB2_4
; SI-NEXT: ; %bb.3: ; %if
-; SI-NEXT: s_add_i32 s0, s0, s1
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: ; %bb.4: ; %endif
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s4, s4, s5
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: .LBB2_4: ; %endif
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index 6de25193fc2a5..9af9894110c07 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -29,6 +29,9 @@ define amdgpu_kernel void @kernel(i32 %a, i32 addrspace(1)* %x, i32 noundef %n)
; UNIFY-NEXT: call void @llvm.trap()
; UNIFY-NEXT: br label %UnifiedUnreachableBlock
; UNIFY-LABEL: if.end6.sink.split:
+; UNIFY-NEXT: %x.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %kernel.kernarg.segment, i64 8
+; UNIFY-NEXT: %x.kernarg.offset.cast = bitcast i8 addrspace(4)* %x.kernarg.offset to i32 addrspace(1)* addrspace(4)*
+; UNIFY-NEXT: %x.load = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %x.kernarg.offset.cast, align 8, !invariant.load !0
; UNIFY-NEXT: %idxprom = sext i32 %tid to i64
; UNIFY-NEXT: %x1 = getelementptr inbounds i32, i32 addrspace(1)* %x.load, i64 %idxprom
; UNIFY-NEXT: store i32 %a.load, i32 addrspace(1)* %x1, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index f3ee132041722..8fdf6d1683ebb 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1157,38 +1157,37 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
;
; GCN-IR-LABEL: s_test_srem24_48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GCN-IR-NEXT: s_mov_b32 s13, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
-; GCN-IR-NEXT: s_sext_i32_i16 s1, s1
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 24
-; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 24
+; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
+; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24
; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[0:1], 16
-; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31
-; GCN-IR-NEXT: s_ashr_i32 s12, s7, 31
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 16
-; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[6:7], 16
-; GCN-IR-NEXT: s_mov_b32 s1, s0
-; GCN-IR-NEXT: s_mov_b32 s13, s12
-; GCN-IR-NEXT: s_xor_b64 s[2:3], s[8:9], s[0:1]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[10:11], s[12:13]
-; GCN-IR-NEXT: s_sub_u32 s2, s2, s0
-; GCN-IR-NEXT: s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT: s_sub_u32 s6, s6, s12
-; GCN-IR-NEXT: s_subb_u32 s7, s7, s12
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
+; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16
+; GCN-IR-NEXT: s_mov_b32 s3, s2
+; GCN-IR-NEXT: s_mov_b32 s11, s10
+; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
+; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11]
+; GCN-IR-NEXT: s_sub_u32 s4, s4, s2
+; GCN-IR-NEXT: s_subb_u32 s5, s5, s2
+; GCN-IR-NEXT: s_sub_u32 s6, s6, s10
+; GCN-IR-NEXT: s_subb_u32 s7, s7, s10
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT: s_mov_b32 s13, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7
; GCN-IR-NEXT: s_min_u32 s12, s8, s9
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2
+; GCN-IR-NEXT: s_flbit_i32_b32 s8, s4
; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3
+; GCN-IR-NEXT: s_flbit_i32_b32 s9, s5
; GCN-IR-NEXT: s_min_u32 s16, s8, s9
; GCN-IR-NEXT: s_sub_u32 s14, s12, s16
; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
@@ -1196,8 +1195,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63
; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19]
; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec
-; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3
-; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2
+; GCN-IR-NEXT: s_cselect_b32 s11, 0, s5
+; GCN-IR-NEXT: s_cselect_b32 s10, 0, s4
; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21]
; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19]
@@ -1208,10 +1207,10 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14
+; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s14
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s18
+; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[4:5], s18
; GCN-IR-NEXT: s_add_u32 s18, s6, -1
; GCN-IR-NEXT: s_addc_u32 s19, s7, -1
; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13]
@@ -1246,24 +1245,26 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: .LBB9_5: ; %udiv-end
; GCN-IR-NEXT: v_mov_b32_e32 v0, s10
; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-IR-NEXT: s_mul_i32 s8, s6, s11
-; GCN-IR-NEXT: s_mul_i32 s7, s7, s10
-; GCN-IR-NEXT: s_mul_i32 s6, s6, s10
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s8, v0
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s7, v0
-; GCN-IR-NEXT: v_mov_b32_e32 v1, s6
-; GCN-IR-NEXT: v_mov_b32_e32 v2, s3
-; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s2, v1
+; GCN-IR-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
+; GCN-IR-NEXT: s_mul_i32 s0, s6, s11
+; GCN-IR-NEXT: v_mov_b32_e32 v2, s5
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT: s_mul_i32 s0, s7, s10
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT: s_mul_i32 s0, s6, s10
+; GCN-IR-NEXT: v_mov_b32_e32 v1, s0
+; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s4, v1
; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1
-; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0
-; GCN-IR-NEXT: v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v1, s2, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v0, s3, v0
+; GCN-IR-NEXT: v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1
+; GCN-IR-NEXT: s_mov_b32 s15, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s14, -1
; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v0, v2, vcc
-; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s6, -1
-; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT: buffer_store_short v0, off, s[12:15], 0 offset:4
+; GCN-IR-NEXT: buffer_store_dword v1, off, s[12:15], 0
; GCN-IR-NEXT: s_endpgm
%1 = ashr i48 %x, 24
%2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index f652331362659..62eea86ed8f75 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -7,28 +7,28 @@ target triple="amdgcn--"
define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
; CHECK-LABEL: foobar:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_mov_b32 s6, -1
+; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
; CHECK-NEXT: ; %bb.1: ; %ift
-; CHECK-NEXT: s_mov_b32 s0, s1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: s_mov_b32 s4, s5
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: ; %bb.2: ; %ife
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_mov_b32 s7, 0xf000
-; CHECK-NEXT: s_mov_b32 s4, s2
-; CHECK-NEXT: s_mov_b32 s5, s3
-; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; CHECK-NEXT: s_mov_b32 s3, 0xf000
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CHECK-NEXT: s_endpgm
; FIXME: The change related to the fact that
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 9c99ef27a0748..3c3c77b1a5331 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -783,90 +783,90 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
;
; GCN-IR-LABEL: s_test_udiv24_i48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; GCN-IR-NEXT: s_mov_b32 s11, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_and_b32 s3, s7, 0xffff
-; GCN-IR-NEXT: s_and_b32 s2, s6, 0xff000000
-; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff
-; GCN-IR-NEXT: s_and_b32 s0, s0, 0xff000000
+; GCN-IR-NEXT: s_and_b32 s3, s5, 0xffff
+; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000
+; GCN-IR-NEXT: s_and_b32 s5, s7, 0xffff
+; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000
; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24
-; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[0:1], 24
+; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24
; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff
-; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0
+; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[2:3], s[6:7]
-; GCN-IR-NEXT: s_flbit_i32_b32 s2, s0
-; GCN-IR-NEXT: s_add_i32 s2, s2, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s3, s1
-; GCN-IR-NEXT: s_min_u32 s10, s2, s3
-; GCN-IR-NEXT: s_flbit_i32_b32 s2, s8
-; GCN-IR-NEXT: s_add_i32 s2, s2, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s3, s9
-; GCN-IR-NEXT: s_min_u32 s14, s2, s3
-; GCN-IR-NEXT: s_sub_u32 s6, s10, s14
-; GCN-IR-NEXT: s_subb_u32 s7, 0, 0
-; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[6:7], 63
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[6:7], 63
-; GCN-IR-NEXT: s_or_b64 s[16:17], s[12:13], s[16:17]
-; GCN-IR-NEXT: s_and_b64 s[12:13], s[16:17], exec
-; GCN-IR-NEXT: s_cselect_b32 s13, 0, s9
-; GCN-IR-NEXT: s_cselect_b32 s12, 0, s8
+; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
+; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2
+; GCN-IR-NEXT: s_add_i32 s4, s4, 32
+; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3
+; GCN-IR-NEXT: s_min_u32 s10, s4, s5
+; GCN-IR-NEXT: s_flbit_i32_b32 s4, s8
+; GCN-IR-NEXT: s_add_i32 s4, s4, 32
+; GCN-IR-NEXT: s_flbit_i32_b32 s5, s9
+; GCN-IR-NEXT: s_min_u32 s14, s4, s5
+; GCN-IR-NEXT: s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT: s_or_b64 s[16:17], s[6:7], s[16:17]
+; GCN-IR-NEXT: s_and_b64 s[6:7], s[16:17], exec
+; GCN-IR-NEXT: s_cselect_b32 s7, 0, s9
+; GCN-IR-NEXT: s_cselect_b32 s6, 0, s8
; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT: s_mov_b64 s[2:3], 0
+; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17]
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: s_add_u32 s12, s6, 1
-; GCN-IR-NEXT: s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 0
-; GCN-IR-NEXT: s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s6
+; GCN-IR-NEXT: s_add_u32 s16, s12, 1
+; GCN-IR-NEXT: s_addc_u32 s17, s13, 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[16:17], 0
+; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
+; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7]
+; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], s12
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s12
-; GCN-IR-NEXT: s_add_u32 s15, s0, -1
-; GCN-IR-NEXT: s_addc_u32 s16, s1, -1
-; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11]
-; GCN-IR-NEXT: s_add_u32 s8, s2, s14
-; GCN-IR-NEXT: s_addc_u32 s9, s3, 0
+; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s16
+; GCN-IR-NEXT: s_add_u32 s15, s2, -1
+; GCN-IR-NEXT: s_addc_u32 s16, s3, -1
+; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11]
+; GCN-IR-NEXT: s_add_u32 s8, s4, s14
+; GCN-IR-NEXT: s_addc_u32 s9, s5, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT: s_mov_b32 s3, 0
+; GCN-IR-NEXT: s_mov_b32 s5, 0
; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while
; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31
+; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31
; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3]
+; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5]
; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s2, s15, s12
-; GCN-IR-NEXT: s_subb_u32 s2, s16, s13
-; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31
+; GCN-IR-NEXT: s_sub_u32 s4, s15, s12
+; GCN-IR-NEXT: s_subb_u32 s4, s16, s13
+; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31
; GCN-IR-NEXT: s_mov_b32 s11, s10
-; GCN-IR-NEXT: s_and_b32 s2, s10, 1
-; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[0:1]
+; GCN-IR-NEXT: s_and_b32 s4, s10, 1
+; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3]
; GCN-IR-NEXT: s_sub_u32 s12, s12, s10
; GCN-IR-NEXT: s_subb_u32 s13, s13, s11
; GCN-IR-NEXT: s_add_u32 s8, s8, 1
; GCN-IR-NEXT: s_addc_u32 s9, s9, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 0
-; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19]
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3
; GCN-IR-NEXT: .LBB7_4: ; %Flow3
-; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
-; GCN-IR-NEXT: s_or_b64 s[12:13], s[2:3], s[0:1]
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1
+; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3]
; GCN-IR-NEXT: .LBB7_5: ; %udiv-end
-; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s6, -1
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s13
-; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT: s_waitcnt expcnt(0)
-; GCN-IR-NEXT: v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT: v_mov_b32_e32 v1, s6
+; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; GCN-IR-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-IR-NEXT: s_endpgm
%1 = lshr i48 %x, 24
%2 = lshr i48 %y, 24
diff --git a/llvm/test/Transforms/Sink/invariant-load.ll b/llvm/test/Transforms/Sink/invariant-load.ll
new file mode 100644
index 0000000000000..0e5ab2fbf7fef
--- /dev/null
+++ b/llvm/test/Transforms/Sink/invariant-load.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -sink -S < %s | FileCheck %s
+
+; Loads marked invariant can be sunk across critical edges
+
+define <4 x float> @invariant_load(<4 x float> *%in, i32 %s) {
+; CHECK-LABEL: @invariant_load(
+; CHECK-NEXT: main_body:
+; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[S:%.*]], 0
+; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]]
+; CHECK: block:
+; CHECK-NEXT: [[Z:%.*]] = add i32 [[S]], 1
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[V:%.*]] = load <4 x float>, <4 x float>* [[IN:%.*]], align 16, !invariant.load !0
+; CHECK-NEXT: ret <4 x float> [[V]]
+;
+main_body:
+ %v = load <4 x float>, <4 x float> *%in, !invariant.load !0
+ %c = icmp eq i32 %s, 0
+ br i1 %c, label %block, label %end
+block:
+ %z = add i32 %s, 1
+ br label %end
+end:
+ ret <4 x float> %v
+}
+
+!0 = !{}
More information about the llvm-commits
mailing list